| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "like.h" |
| |
| #include <fmt/format.h> |
| #include <hs/hs_compile.h> |
| #include <re2/stringpiece.h> |
| |
| #include <cstddef> |
| #include <ostream> |
| #include <utility> |
| #include <vector> |
| |
| #include "common/logging.h" |
| #include "vec/columns/column.h" |
| #include "vec/columns/column_const.h" |
| #include "vec/columns/column_vector.h" |
| #include "vec/common/string_ref.h" |
| #include "vec/core/block.h" |
| #include "vec/core/column_with_type_and_name.h" |
| #include "vec/functions/simple_function_factory.h" |
| |
| namespace doris::vectorized { |
| // A regex to match any regex pattern which is equivalent to a substring search. |
| static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)"); |
| |
| // A regex to match any regex pattern which is equivalent to matching a constant string |
| // at the end of the string values. |
| static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)"); |
| |
| // A regex to match any regex pattern which is equivalent to matching a constant string |
| // at the start of the string values. |
| static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)"); |
| |
| // A regex to match any regex pattern which is equivalent to a constant string match. |
| static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)"); |
| // A regex to match any regex pattern made only of one or more ".*", which accepts every value. |
| static const RE2 ALLPASS_RE(R"((\.\*)+)"); |
| |
| // LIKE-pattern counterparts of the shapes above; '%' plays the role of ".*". |
| static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))"); |
| static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)"); |
| static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))"); |
| static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)"); |
| static const re2::RE2 LIKE_ALLPASS_RE("%+"); |
| |
| struct VectorAllpassSearchState : public VectorPatternSearchState { |
| VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {} |
| |
| ~VectorAllpassSearchState() override = default; |
| |
| void like_pattern_match(const std::string& pattern_str) override { |
| if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) { |
| _search_strings->insert_default(); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| |
| void regexp_pattern_match(const std::string& pattern_str) override { |
| if (RE2::FullMatch(pattern_str, ALLPASS_RE)) { |
| _search_strings->insert_default(); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| }; |
| |
| struct VectorEqualSearchState : public VectorPatternSearchState { |
| VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {} |
| |
| ~VectorEqualSearchState() override = default; |
| |
| void like_pattern_match(const std::string& pattern_str) override { |
| _search_string.clear(); |
| if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) { |
| FunctionLike::remove_escape_character(&_search_string); |
| _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| |
| void regexp_pattern_match(const std::string& pattern_str) override { |
| _search_string.clear(); |
| if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) { |
| _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| }; |
| |
| struct VectorSubStringSearchState : public VectorPatternSearchState { |
| VectorSubStringSearchState() |
| : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {} |
| |
| ~VectorSubStringSearchState() override = default; |
| |
| void like_pattern_match(const std::string& pattern_str) override { |
| _search_string.clear(); |
| if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) { |
| FunctionLike::remove_escape_character(&_search_string); |
| _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| |
| void regexp_pattern_match(const std::string& pattern_str) override { |
| _search_string.clear(); |
| if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) { |
| _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| }; |
| |
| struct VectorStartsWithSearchState : public VectorPatternSearchState { |
| VectorStartsWithSearchState() |
| : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {} |
| |
| ~VectorStartsWithSearchState() override = default; |
| |
| void like_pattern_match(const std::string& pattern_str) override { |
| _search_string.clear(); |
| if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) { |
| FunctionLike::remove_escape_character(&_search_string); |
| _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| |
| void regexp_pattern_match(const std::string& pattern_str) override { |
| _search_string.clear(); |
| if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) { |
| _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| }; |
| |
| struct VectorEndsWithSearchState : public VectorPatternSearchState { |
| VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {} |
| |
| ~VectorEndsWithSearchState() override = default; |
| |
| void like_pattern_match(const std::string& pattern_str) override { |
| _search_string.clear(); |
| if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) { |
| FunctionLike::remove_escape_character(&_search_string); |
| _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| |
| void regexp_pattern_match(const std::string& pattern_str) override { |
| _search_string.clear(); |
| if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) { |
| _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
| } else { |
| _pattern_matched = false; |
| } |
| } |
| }; |
| |
| Status LikeSearchState::clone(LikeSearchState& cloned) { |
| cloned.escape_char = escape_char; |
| cloned.set_search_string(search_string); |
| |
| std::string re_pattern; |
| FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern); |
| if (hs_database) { // use hyperscan |
| hs_database_t* database = nullptr; |
| hs_scratch_t* scratch = nullptr; |
| RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); |
| |
| cloned.hs_database.reset(database); |
| cloned.hs_scratch.reset(scratch); |
| } else { // fallback to re2 |
| cloned.hs_database.reset(); |
| cloned.hs_scratch.reset(); |
| |
| RE2::Options opts; |
| opts.set_never_nl(false); |
| opts.set_dot_nl(true); |
| cloned.regex = std::make_unique<RE2>(re_pattern, opts); |
| if (!cloned.regex->ok()) { |
| return Status::InternalError("Invalid regex expression: {}", re_pattern); |
| } |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_allpass_fn(LikeSearchState* state, const ColumnString& vals, |
| const StringRef& pattern, |
| ColumnUInt8::Container& result) { |
| memset(result.data(), 1, vals.size()); |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_allpass_fn_scalar(LikeSearchState* state, const StringRef& val, |
| const StringRef& pattern, |
| unsigned char* result) { |
| *result = 1; |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals, |
| const ColumnString& search_strings, |
| ColumnUInt8::Container& result) { |
| DCHECK(vals.size() == search_strings.size()); |
| DCHECK(vals.size() == result.size()); |
| memset(result.data(), 1, vals.size()); |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_starts_with_fn(LikeSearchState* state, const ColumnString& val, |
| const StringRef& pattern, |
| ColumnUInt8::Container& result) { |
| auto sz = val.size(); |
| for (size_t i = 0; i < sz; i++) { |
| const auto& str_ref = val.get_data_at(i); |
| result[i] = (str_ref.size >= state->search_string_sv.size) && |
| str_ref.start_with(state->search_string_sv); |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_starts_with_fn_scalar(LikeSearchState* state, |
| const StringRef& val, |
| const StringRef& pattern, |
| unsigned char* result) { |
| *result = (val.size >= state->search_string_sv.size) && |
| (state->search_string_sv == val.substring(0, state->search_string_sv.size)); |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals, |
| const ColumnString& search_strings, |
| ColumnUInt8::Container& result) { |
| DCHECK(vals.size() == search_strings.size()); |
| DCHECK(vals.size() == result.size()); |
| auto sz = vals.size(); |
| for (size_t i = 0; i < sz; ++i) { |
| const auto& str_sv = vals.get_data_at(i); |
| const auto& search_string_sv = search_strings.get_data_at(i); |
| result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv); |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_ends_with_fn(LikeSearchState* state, const ColumnString& val, |
| const StringRef& pattern, |
| ColumnUInt8::Container& result) { |
| auto sz = val.size(); |
| for (size_t i = 0; i < sz; i++) { |
| const auto& str_ref = val.get_data_at(i); |
| result[i] = (str_ref.size >= state->search_string_sv.size) && |
| str_ref.end_with(state->search_string_sv); |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_ends_with_fn_scalar(LikeSearchState* state, const StringRef& val, |
| const StringRef& pattern, |
| unsigned char* result) { |
| *result = (val.size >= state->search_string_sv.size) && |
| (state->search_string_sv == val.substring(val.size - state->search_string_sv.size, |
| state->search_string_sv.size)); |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals, |
| const ColumnString& search_strings, |
| ColumnUInt8::Container& result) { |
| DCHECK(vals.size() == search_strings.size()); |
| DCHECK(vals.size() == result.size()); |
| auto sz = vals.size(); |
| for (size_t i = 0; i < sz; ++i) { |
| const auto& str_sv = vals.get_data_at(i); |
| const auto& search_string_sv = search_strings.get_data_at(i); |
| result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv); |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_equals_fn(LikeSearchState* state, const ColumnString& val, |
| const StringRef& pattern, |
| ColumnUInt8::Container& result) { |
| auto sz = val.size(); |
| for (size_t i = 0; i < sz; i++) { |
| result[i] = (val.get_data_at(i) == state->search_string_sv); |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_equals_fn_scalar(LikeSearchState* state, const StringRef& val, |
| const StringRef& pattern, |
| unsigned char* result) { |
| *result = (val == state->search_string_sv); |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals, |
| const ColumnString& search_strings, |
| ColumnUInt8::Container& result) { |
| DCHECK(vals.size() == search_strings.size()); |
| DCHECK(vals.size() == result.size()); |
| auto sz = vals.size(); |
| for (size_t i = 0; i < sz; ++i) { |
| const auto& str_sv = vals.get_data_at(i); |
| const auto& search_string_sv = search_strings.get_data_at(i); |
| result[i] = str_sv == search_string_sv; |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const ColumnString& val, |
| const StringRef& pattern, |
| ColumnUInt8::Container& result) { |
| auto sz = val.size(); |
| for (size_t i = 0; i < sz; i++) { |
| if (state->search_string_sv.size == 0) { |
| result[i] = true; |
| continue; |
| } |
| result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1; |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_substring_fn_scalar(LikeSearchState* state, const StringRef& val, |
| const StringRef& pattern, |
| unsigned char* result) { |
| if (state->search_string_sv.size == 0) { |
| *result = true; |
| return Status::OK(); |
| } |
| *result = state->substring_pattern.search(val) != -1; |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals, |
| const ColumnString& search_strings, |
| ColumnUInt8::Container& result) { |
| DCHECK(vals.size() == search_strings.size()); |
| DCHECK(vals.size() == result.size()); |
| auto sz = vals.size(); |
| for (size_t i = 0; i < sz; ++i) { |
| const auto& str_sv = vals.get_data_at(i); |
| const auto& search_string_sv = search_strings.get_data_at(i); |
| if (search_string_sv.size == 0) { |
| result[i] = true; |
| continue; |
| } |
| doris::StringSearch substring_search(&search_string_sv); |
| result[i] = substring_search.search(str_sv) != -1; |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const StringRef& val, |
| const StringRef& pattern, unsigned char* result) { |
| if (state->hs_database) { // use hyperscan |
| auto ret = hs_scan(state->hs_database.get(), val.data, val.size, 0, state->hs_scratch.get(), |
| doris::vectorized::LikeSearchState::hs_match_handler, (void*)result); |
| if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { |
| return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); |
| } |
| } else { // fallback to re2 |
| *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex); |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::regexp_fn_scalar(LikeSearchState* state, const StringRef& val, |
| const StringRef& pattern, unsigned char* result) { |
| RE2::Options opts; |
| opts.set_never_nl(false); |
| opts.set_dot_nl(true); |
| re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts); |
| if (re.ok()) { |
| *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re); |
| } else { |
| return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnString& val, |
| const StringRef& pattern, |
| ColumnUInt8::Container& result) { |
| auto sz = val.size(); |
| if (state->hs_database) { // use hyperscan |
| for (size_t i = 0; i < sz; i++) { |
| const auto& str_ref = val.get_data_at(i); |
| auto ret = hs_scan(state->hs_database.get(), str_ref.data, str_ref.size, 0, |
| state->hs_scratch.get(), |
| doris::vectorized::LikeSearchState::hs_match_handler, |
| (void*)(result.data() + i)); |
| if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { |
| return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); |
| } |
| } |
| } else { // fallback to re2 |
| for (size_t i = 0; i < sz; i++) { |
| const auto& str_ref = val.get_data_at(i); |
| *(result.data() + i) = |
| RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex); |
| } |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const ColumnString& val, |
| const StringRef& pattern, ColumnUInt8::Container& result) { |
| std::string re_pattern(pattern.data, pattern.size); |
| |
| hs_database_t* database = nullptr; |
| hs_scratch_t* scratch = nullptr; |
| if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan |
| auto sz = val.size(); |
| for (size_t i = 0; i < sz; i++) { |
| const auto& str_ref = val.get_data_at(i); |
| auto ret = hs_scan(database, str_ref.data, str_ref.size, 0, scratch, |
| doris::vectorized::LikeSearchState::hs_match_handler, |
| (void*)(result.data() + i)); |
| if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { |
| return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); |
| } |
| } |
| |
| hs_free_scratch(scratch); |
| hs_free_database(database); |
| } else { // fallback to re2 |
| RE2::Options opts; |
| opts.set_never_nl(false); |
| opts.set_dot_nl(true); |
| re2::RE2 re(re_pattern, opts); |
| if (re.ok()) { |
| auto sz = val.size(); |
| for (size_t i = 0; i < sz; i++) { |
| const auto& str_ref = val.get_data_at(i); |
| *(result.data() + i) = |
| RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re); |
| } |
| } else { |
| return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); |
| } |
| } |
| |
| return Status::OK(); |
| } |
| |
| // hyperscan compile expression to database and allocate scratch space |
| Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression, |
| hs_database_t** database, hs_scratch_t** scratch) { |
| hs_compile_error_t* compile_err; |
| auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8, |
| HS_MODE_BLOCK, nullptr, database, &compile_err); |
| |
| if (res != HS_SUCCESS) { |
| *database = nullptr; |
| std::string error_message = compile_err->message; |
| hs_free_compile_error(compile_err); |
| // Do not call FunctionContext::set_error here, since we do not want to cancel the query here. |
| return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message); |
| } |
| hs_free_compile_error(compile_err); |
| |
| if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) { |
| hs_free_database(*database); |
| *database = nullptr; |
| *scratch = nullptr; |
| // Do not call FunctionContext::set_error here, since we do not want to cancel the query here. |
| return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error"); |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block, |
| const ColumnNumbers& arguments, uint32_t result, |
| size_t input_rows_count) const { |
| const auto values_col = |
| block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); |
| const auto* values = check_and_get_column<ColumnString>(values_col.get()); |
| |
| if (!values) { |
| return Status::InternalError("Not supported input arguments types"); |
| } |
| // result column |
| auto res = ColumnUInt8::create(); |
| ColumnUInt8::Container& vec_res = res->get_data(); |
| // set default value to 0, and match functions only need to set 1/true |
| vec_res.resize_fill(input_rows_count); |
| auto* state = reinterpret_cast<LikeState*>( |
| context->get_function_state(FunctionContext::THREAD_LOCAL)); |
| // for constant_substring_fn, use long run length search for performance |
| if (constant_substring_fn == |
| *(state->function.target<doris::Status (*)(LikeSearchState* state, const ColumnString&, |
| const StringRef&, ColumnUInt8::Container&)>())) { |
| RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res, |
| &state->search_state)); |
| } else { |
| const auto pattern_col = block.get_by_position(arguments[1]).column; |
| if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) { |
| RETURN_IF_ERROR( |
| vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count)); |
| } else if (const auto* const_patterns = |
| check_and_get_column<ColumnConst>(pattern_col.get())) { |
| const auto& pattern_val = const_patterns->get_data_at(0); |
| RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function, |
| &state->search_state)); |
| } else { |
| return Status::InternalError("Not supported input arguments types"); |
| } |
| } |
| block.replace_by_position(result, std::move(res)); |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values, |
| const ColumnString::Offsets& value_offsets, |
| ColumnUInt8::Container& result, |
| LikeSearchState* search_state) const { |
| // treat continuous multi string data as a long string data |
| const UInt8* begin = values.data(); |
| const UInt8* end = begin + values.size(); |
| const UInt8* pos = begin; |
| |
| /// Current index in the array of strings. |
| size_t i = 0; |
| size_t needle_size = search_state->substring_pattern.get_pattern_length(); |
| |
| /// We will search for the next occurrence in all strings at once. |
| while (pos < end) { |
| // search return matched substring start offset |
| pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos); |
| if (pos >= end) { |
| break; |
| } |
| |
| /// Determine which index it refers to. |
| /// begin + value_offsets[i] is the start offset of string at i+1 |
| while (i < value_offsets.size() && begin + value_offsets[i] < pos) { |
| ++i; |
| } |
| |
| /// We check that the entry does not pass through the boundaries of strings. |
| if (pos + needle_size <= begin + value_offsets[i]) { |
| result[i] = 1; |
| } |
| |
| // move to next string offset |
| pos = begin + value_offsets[i]; |
| ++i; |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val, |
| ColumnUInt8::Container& result, const LikeFn& function, |
| LikeSearchState* search_state) const { |
| RETURN_IF_ERROR((function)(search_state, values, |
| *reinterpret_cast<const StringRef*>(pattern_val), result)); |
| return Status::OK(); |
| } |
| |
| template <bool LIKE_PATTERN> |
| VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) { |
| VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>(); |
| VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>(); |
| VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>(); |
| VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>(); |
| VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>(); |
| size_t size = patterns.size(); |
| |
| for (size_t i = 0; i < size; ++i) { |
| if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched && |
| !substring_state->_pattern_matched && !starts_with_state->_pattern_matched && |
| !ends_with_state->_pattern_matched) { |
| return nullptr; |
| } |
| std::string pattern_str = patterns.get_data_at(i).to_string(); |
| if (allpass_state->_pattern_matched) { |
| if constexpr (LIKE_PATTERN) { |
| allpass_state->like_pattern_match(pattern_str); |
| } else { |
| allpass_state->regexp_pattern_match(pattern_str); |
| } |
| } |
| if (equal_state->_pattern_matched) { |
| if constexpr (LIKE_PATTERN) { |
| equal_state->like_pattern_match(pattern_str); |
| } else { |
| equal_state->regexp_pattern_match(pattern_str); |
| } |
| } |
| if (substring_state->_pattern_matched) { |
| if constexpr (LIKE_PATTERN) { |
| substring_state->like_pattern_match(pattern_str); |
| } else { |
| substring_state->regexp_pattern_match(pattern_str); |
| } |
| } |
| if (starts_with_state->_pattern_matched) { |
| if constexpr (LIKE_PATTERN) { |
| starts_with_state->like_pattern_match(pattern_str); |
| } else { |
| starts_with_state->regexp_pattern_match(pattern_str); |
| } |
| } |
| if (ends_with_state->_pattern_matched) { |
| if constexpr (LIKE_PATTERN) { |
| ends_with_state->like_pattern_match(pattern_str); |
| } else { |
| ends_with_state->regexp_pattern_match(pattern_str); |
| } |
| } |
| } |
| |
| if (allpass_state->_pattern_matched) { |
| return allpass_state; |
| } else if (equal_state->_pattern_matched) { |
| return equal_state; |
| } else if (substring_state->_pattern_matched) { |
| return substring_state; |
| } else if (starts_with_state->_pattern_matched) { |
| return starts_with_state; |
| } else if (ends_with_state->_pattern_matched) { |
| return ends_with_state; |
| } else { |
| return nullptr; |
| } |
| } |
| |
| Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns, |
| ColumnUInt8::Container& result, LikeState* state, |
| size_t input_rows_count) const { |
| VPatternSearchStateSPtr vector_search_state; |
| if (state->is_like_pattern) { |
| vector_search_state = pattern_type_recognition<true>(patterns); |
| } else { |
| vector_search_state = pattern_type_recognition<false>(patterns); |
| } |
| if (vector_search_state == nullptr) { |
| // pattern type recognition failed, use default case |
| for (int i = 0; i < input_rows_count; ++i) { |
| const auto pattern_val = patterns.get_data_at(i); |
| const auto value_val = values.get_data_at(i); |
| RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val, |
| &result[i])); |
| } |
| return Status::OK(); |
| } |
| const auto* search_strings = |
| static_cast<const ColumnString*>(vector_search_state->_search_strings.get()); |
| return (vector_search_state->_vector_function)(values, *search_strings, result); |
| } |
| |
| Status FunctionLike::like_fn(LikeSearchState* state, const ColumnString& val, |
| const StringRef& pattern, ColumnUInt8::Container& result) { |
| std::string re_pattern; |
| convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern); |
| return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result); |
| } |
| |
| Status FunctionLike::like_fn_scalar(LikeSearchState* state, const StringRef& val, |
| const StringRef& pattern, unsigned char* result) { |
| std::string re_pattern; |
| convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern); |
| |
| return regexp_fn_scalar(state, StringRef(val.data, val.size), |
| {re_pattern.c_str(), re_pattern.size()}, result); |
| } |
| |
| void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::string& pattern, |
| std::string* re_pattern) { |
| re_pattern->clear(); |
| |
| if (pattern.empty()) { |
| re_pattern->append("^$"); |
| return; |
| } |
| |
| // add ^ to pattern head to match line head |
| if (!pattern.empty() && pattern[0] != '%') { |
| re_pattern->append("^"); |
| } |
| |
| bool is_escaped = false; |
| // expect % and _, all chars should keep it literal means. |
| for (char i : pattern) { |
| if (is_escaped) { // last is \, this should be escape |
| if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' || i == '-' || |
| i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || i == ':' || i == '^' || |
| i == '.' || i == '$' || i == '?') { |
| re_pattern->append(1, '\\'); |
| } else if (i != '%' && i != '_') { |
| re_pattern->append(2, '\\'); |
| } |
| re_pattern->append(1, i); |
| is_escaped = false; |
| } else { |
| switch (i) { |
| case '%': |
| re_pattern->append(".*"); |
| break; |
| case '_': |
| re_pattern->append("."); |
| break; |
| default: |
| is_escaped = i == state->escape_char; |
| if (!is_escaped) { |
| // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ? |
| if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' || |
| i == '-' || i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || |
| i == ':' || i == '^' || i == '.' || i == '$' || i == '?') { |
| re_pattern->append(1, '\\'); |
| } |
| re_pattern->append(1, i); |
| } |
| break; |
| } |
| } |
| } |
| |
| // add $ to pattern tail to match line tail |
| if (!pattern.empty() && re_pattern->back() != '*') { |
| re_pattern->append("$"); |
| } |
| } |
| |
| void FunctionLike::remove_escape_character(std::string* search_string) { |
| std::string tmp_search_string; |
| tmp_search_string.swap(*search_string); |
| int len = tmp_search_string.length(); |
| for (int i = 0; i < len;) { |
| if (tmp_search_string[i] == '\\' && i + 1 < len && |
| (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' || |
| tmp_search_string[i + 1] == '\\')) { |
| search_string->append(1, tmp_search_string[i + 1]); |
| i += 2; |
| } else { |
| search_string->append(1, tmp_search_string[i]); |
| i++; |
| } |
| } |
| } |
| |
| bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) { |
| if (!re.ok()) { |
| return false; |
| } |
| |
| std::vector<RE2::Arg> arguments; |
| std::vector<RE2::Arg*> arguments_ptrs; |
| std::size_t args_count = re.NumberOfCapturingGroups(); |
| arguments.resize(args_count); |
| arguments_ptrs.resize(args_count); |
| results.resize(args_count); |
| for (std::size_t i = 0; i < args_count; ++i) { |
| arguments[i] = &results[i]; |
| arguments_ptrs[i] = &arguments[i]; |
| } |
| |
| return RE2::FullMatchN(str, re, arguments_ptrs.data(), args_count); |
| } |
| |
| void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) { |
| std::vector<std::string> results; |
| VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name |
| << ": " << re.pattern() << ", size: " << re.pattern().size(); |
| if (re2_full_match(str, re, results)) { |
| for (int i = 0; i < results.size(); ++i) { |
| VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size(); |
| } |
| } else { |
| VLOG_DEBUG << "no match"; |
| } |
| } |
| |
| Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern, |
| std::shared_ptr<LikeState>& state, |
| bool try_hyperscan) { |
| std::string pattern_str = pattern.to_string(); |
| state->search_state.pattern_str = pattern_str; |
| std::string search_string; |
| |
| if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) { |
| state->search_state.set_search_string(""); |
| state->function = constant_allpass_fn; |
| state->scalar_function = constant_allpass_fn_scalar; |
| } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) { |
| if (VLOG_DEBUG_IS_ON) { |
| verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE); |
| VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); |
| } |
| remove_escape_character(&search_string); |
| if (VLOG_DEBUG_IS_ON) { |
| VLOG_DEBUG << "search_string escape removed: " << search_string |
| << ", size: " << search_string.size(); |
| } |
| state->search_state.set_search_string(search_string); |
| state->function = constant_equals_fn; |
| state->scalar_function = constant_equals_fn_scalar; |
| } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) { |
| if (VLOG_DEBUG_IS_ON) { |
| verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE); |
| VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); |
| } |
| remove_escape_character(&search_string); |
| if (VLOG_DEBUG_IS_ON) { |
| VLOG_DEBUG << "search_string escape removed: " << search_string |
| << ", size: " << search_string.size(); |
| } |
| state->search_state.set_search_string(search_string); |
| state->function = constant_starts_with_fn; |
| state->scalar_function = constant_starts_with_fn_scalar; |
| } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) { |
| if (VLOG_DEBUG_IS_ON) { |
| verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE); |
| VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); |
| } |
| remove_escape_character(&search_string); |
| if (VLOG_DEBUG_IS_ON) { |
| VLOG_DEBUG << "search_string escape removed: " << search_string |
| << ", size: " << search_string.size(); |
| } |
| state->search_state.set_search_string(search_string); |
| state->function = constant_ends_with_fn; |
| state->scalar_function = constant_ends_with_fn_scalar; |
| } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) { |
| if (VLOG_DEBUG_IS_ON) { |
| verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE); |
| VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); |
| } |
| remove_escape_character(&search_string); |
| if (VLOG_DEBUG_IS_ON) { |
| VLOG_DEBUG << "search_string escape removed: " << search_string |
| << ", size: " << search_string.size(); |
| } |
| state->search_state.set_search_string(search_string); |
| state->function = constant_substring_fn; |
| state->scalar_function = constant_substring_fn_scalar; |
| } else { |
| std::string re_pattern; |
| convert_like_pattern(&state->search_state, pattern_str, &re_pattern); |
| if (VLOG_DEBUG_IS_ON) { |
| VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str |
| << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern |
| << ", size: " << re_pattern.size(); |
| } |
| |
| hs_database_t* database = nullptr; |
| hs_scratch_t* scratch = nullptr; |
| if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) { |
| // use hyperscan |
| state->search_state.hs_database.reset(database); |
| state->search_state.hs_scratch.reset(scratch); |
| } else { |
| // fallback to re2 |
| // reset hs_database to nullptr to indicate not use hyperscan |
| state->search_state.hs_database.reset(); |
| state->search_state.hs_scratch.reset(); |
| |
| RE2::Options opts; |
| opts.set_never_nl(false); |
| opts.set_dot_nl(true); |
| state->search_state.regex = std::make_unique<RE2>(re_pattern, opts); |
| if (!state->search_state.regex->ok()) { |
| return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern, |
| pattern_str); |
| } |
| } |
| |
| state->function = constant_regex_fn; |
| state->scalar_function = constant_regex_fn_scalar; |
| } |
| return Status::OK(); |
| } |
| |
| Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) { |
| if (scope != FunctionContext::THREAD_LOCAL) { |
| return Status::OK(); |
| } |
| std::shared_ptr<LikeState> state = std::make_shared<LikeState>(); |
| state->is_like_pattern = true; |
| state->function = like_fn; |
| state->scalar_function = like_fn_scalar; |
| if (context->is_col_constant(1)) { |
| const auto pattern_col = context->get_constant_col(1)->column_ptr; |
| const auto& pattern = pattern_col->get_data_at(0); |
| RETURN_IF_ERROR(construct_like_const_state(context, pattern, state)); |
| } |
| context->set_function_state(scope, state); |
| |
| return Status::OK(); |
| } |
| |
| Status FunctionRegexpLike::open(FunctionContext* context, |
| FunctionContext::FunctionStateScope scope) { |
| if (scope != FunctionContext::THREAD_LOCAL) { |
| return Status::OK(); |
| } |
| std::shared_ptr<LikeState> state = std::make_shared<LikeState>(); |
| context->set_function_state(scope, state); |
| state->is_like_pattern = false; |
| state->function = regexp_fn; |
| state->scalar_function = regexp_fn_scalar; |
| if (context->is_col_constant(1)) { |
| const auto pattern_col = context->get_constant_col(1)->column_ptr; |
| const auto& pattern = pattern_col->get_data_at(0); |
| |
| std::string pattern_str = pattern.to_string(); |
| std::string search_string; |
| if (RE2::FullMatch(pattern_str, ALLPASS_RE)) { |
| state->search_state.set_search_string(""); |
| state->function = constant_allpass_fn; |
| state->scalar_function = constant_allpass_fn_scalar; |
| } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) { |
| state->search_state.set_search_string(search_string); |
| state->function = constant_equals_fn; |
| state->scalar_function = constant_equals_fn_scalar; |
| } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) { |
| state->search_state.set_search_string(search_string); |
| state->function = constant_starts_with_fn; |
| state->scalar_function = constant_starts_with_fn_scalar; |
| } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) { |
| state->search_state.set_search_string(search_string); |
| state->function = constant_ends_with_fn; |
| state->scalar_function = constant_ends_with_fn_scalar; |
| } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) { |
| state->search_state.set_search_string(search_string); |
| state->function = constant_substring_fn; |
| state->scalar_function = constant_substring_fn_scalar; |
| } else { |
| hs_database_t* database = nullptr; |
| hs_scratch_t* scratch = nullptr; |
| if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) { |
| // use hyperscan |
| state->search_state.hs_database.reset(database); |
| state->search_state.hs_scratch.reset(scratch); |
| } else { |
| // fallback to re2 |
| // reset hs_database to nullptr to indicate not use hyperscan |
| state->search_state.hs_database.reset(); |
| state->search_state.hs_scratch.reset(); |
| RE2::Options opts; |
| opts.set_never_nl(false); |
| opts.set_dot_nl(true); |
| state->search_state.regex = std::make_unique<RE2>(pattern_str, opts); |
| if (!state->search_state.regex->ok()) { |
| return Status::InternalError("Invalid regex expression: {}", pattern_str); |
| } |
| } |
| state->function = constant_regex_fn; |
| state->scalar_function = constant_regex_fn_scalar; |
| } |
| } |
| return Status::OK(); |
| } |
| |
| void register_function_like(SimpleFunctionFactory& factory) { |
| factory.register_function<FunctionLike>(); |
| } |
| |
| void register_function_regexp(SimpleFunctionFactory& factory) { |
| factory.register_function<FunctionRegexpLike>(); |
| factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias); |
| } |
| |
| } // namespace doris::vectorized |