blob: f479f12fab93fca6b254e016a86a9bb4285d395d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "exprs/like-predicate.h"
#include <string.h>
#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <sstream>
#include "gutil/strings/substitute.h"
#include "runtime/string-value.inline.h"
#include "string-functions.h"
#include "common/names.h"
using namespace impala_udf;
using namespace re2;
namespace impala {
// The four regexes below classify a *regex* pattern (as opposed to a LIKE pattern) that
// is simple enough to be evaluated with plain string operations. In each of them the
// single capture group extracts the literal text, which may not contain any of the regex
// metacharacters . ^ { [ ( | ) ] } + * ? $ or backslash.
//
// A regex to match any regex pattern which is equivalent to a substring search: a
// literal optionally surrounded by any number of ".*" on either side.
static const RE2 SUBSTRING_RE(
"(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
// A regex to match any regex pattern which is equivalent to matching a constant string
// at the end of the string values: optional leading ".*", a literal, then "$".
static const RE2 ENDS_WITH_RE(
"(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
// A regex to match any regex pattern which is equivalent to matching a constant string
// at the start of the string values: "^", a literal, then optional trailing ".*".
static const RE2 STARTS_WITH_RE(
"\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
// A regex to match any regex pattern which is equivalent to a constant string match:
// a literal anchored by "^" and "$".
static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
// Prepare entry point that forwards to LikePrepareInternal() with case-sensitive
// matching requested.
void LikePredicate::LikePrepare(FunctionContext* context,
    FunctionContext::FunctionStateScope scope) {
  const bool case_sensitive = true;
  LikePrepareInternal(context, scope, case_sensitive);
}
// Prepare entry point that forwards to LikePrepareInternal() with case-insensitive
// matching requested.
void LikePredicate::ILikePrepare(FunctionContext* context,
    FunctionContext::FunctionStateScope scope) {
  const bool case_sensitive = false;
  LikePrepareInternal(context, scope, case_sensitive);
}
// Allocates the THREAD_LOCAL LikePredicateState and picks the evaluation function for a
// LIKE-style predicate. Does nothing for any other scope.
//
// 'context' supplies the (possibly constant) pattern argument at index 1 and receives
// the newly created state; LikeClose() deletes that state. 'case_sensitive' selects
// between case-sensitive and case-insensitive matching.
//
// When the pattern is a non-NULL constant, it is classified once here:
//  - "%lit%", "lit%", "%lit" and "lit" (where 'lit' contains neither '%' nor '_') are
//    evaluated with the Constant*Fn string functions instead of a regex. This is only
//    done for case-sensitive matching and only if the pattern does not contain the
//    escape character: the classifying regexes know nothing about escaping, so e.g.
//    "abc\%" would otherwise be mistaken for a starts-with pattern instead of the
//    literal "abc%".
//  - Every other pattern is converted to a regex by ConvertLikePattern() and compiled
//    once. An invalid regex is reported through context->SetError().
// For a non-constant or NULL pattern the default LikeFn is kept.
//
// TODO: make class StringValue and StringSearch accept a case-sensitive flag and
// switch back to using the cheaper Constant<>() functions.
void LikePredicate::LikePrepareInternal(FunctionContext* context,
    FunctionContext::FunctionStateScope scope, bool case_sensitive) {
  if (scope != FunctionContext::THREAD_LOCAL) return;
  LikePredicateState* state = new LikePredicateState();
  state->function_ = LikeFn;
  context->SetFunctionState(scope, state);
  if (!context->IsArgConstant(1)) return;

  const StringVal pattern_val =
      *reinterpret_cast<StringVal*>(context->GetConstantArg(1));
  if (pattern_val.is_null) return;
  const string pattern_str(
      reinterpret_cast<const char*>(pattern_val.ptr), pattern_val.len);

  // Compiled once per process rather than on every prepare call.
  static const re2::RE2 substring_re("(?:%+)([^%_]*)(?:%+)");
  static const re2::RE2 ends_with_re("(?:%+)([^%_]*)");
  static const re2::RE2 starts_with_re("([^%_]*)(?:%+)");
  static const re2::RE2 equals_re("([^%_]*)");

  // The fast paths compare against the raw pattern text, which is only correct when no
  // character of the pattern is escaped.
  const bool has_escape = pattern_str.find(state->escape_char_) != string::npos;
  const bool use_fast_path = case_sensitive && !has_escape;

  string search_string;
  if (use_fast_path && RE2::FullMatch(pattern_str, substring_re, &search_string)) {
    state->SetSearchString(search_string);
    state->function_ = ConstantSubstringFn;
  } else if (use_fast_path &&
      RE2::FullMatch(pattern_str, starts_with_re, &search_string)) {
    state->SetSearchString(search_string);
    state->function_ = ConstantStartsWithFn;
  } else if (use_fast_path &&
      RE2::FullMatch(pattern_str, ends_with_re, &search_string)) {
    state->SetSearchString(search_string);
    state->function_ = ConstantEndsWithFn;
  } else if (use_fast_path &&
      RE2::FullMatch(pattern_str, equals_re, &search_string)) {
    state->SetSearchString(search_string);
    state->function_ = ConstantEqualsFn;
  } else {
    string re_pattern;
    ConvertLikePattern(context, pattern_val, &re_pattern);
    RE2::Options opts;
    // Let '.' (generated for '%' and '_') match newlines as well.
    opts.set_never_nl(false);
    opts.set_dot_nl(true);
    opts.set_case_sensitive(case_sensitive);
    state->regex_.reset(new RE2(re_pattern, opts));
    if (!state->regex_->ok()) {
      context->SetError(Substitute("Invalid regex: $0", pattern_str).c_str());
    }
  }
}
// Deletes the THREAD_LOCAL LikePredicateState stored on 'context' and clears the
// stored pointer. Any other scope is ignored.
void LikePredicate::LikeClose(FunctionContext* context,
    FunctionContext::FunctionStateScope scope) {
  if (scope != FunctionContext::THREAD_LOCAL) return;
  auto* like_state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  delete like_state;
  context->SetFunctionState(FunctionContext::THREAD_LOCAL, nullptr);
}
// Prepare entry point that forwards to RegexPrepareInternal() with case-sensitive
// matching requested.
void LikePredicate::RegexPrepare(FunctionContext* context,
    FunctionContext::FunctionStateScope scope) {
  const bool case_sensitive = true;
  RegexPrepareInternal(context, scope, case_sensitive);
}
// Prepare entry point that forwards to RegexPrepareInternal() with case-insensitive
// matching requested.
void LikePredicate::IRegexPrepare(FunctionContext* context,
    FunctionContext::FunctionStateScope scope) {
  const bool case_sensitive = false;
  RegexPrepareInternal(context, scope, case_sensitive);
}
// Allocates the THREAD_LOCAL LikePredicateState for a regex predicate and chooses the
// evaluation function. Does nothing for any other scope. With a non-constant or NULL
// pattern the default RegexFn is kept; otherwise the constant pattern is analysed once
// so that per-row evaluation is as cheap as possible.
void LikePredicate::RegexPrepareInternal(FunctionContext* context,
    FunctionContext::FunctionStateScope scope, bool case_sensitive) {
  if (scope != FunctionContext::THREAD_LOCAL) return;
  LikePredicateState* state = new LikePredicateState();
  context->SetFunctionState(scope, state);
  state->function_ = RegexFn;
  if (!context->IsArgConstant(1)) return;

  StringVal* pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1));
  if (pattern->is_null) return;
  string pattern_str(reinterpret_cast<const char*>(pattern->ptr), pattern->len);

  if (case_sensitive) {
    // A pattern that is just a constant string, a constant prefix followed by
    // wildcards, a constant suffix preceded by wildcards, or a constant substring
    // surrounded by wildcards can be evaluated with our own string match functions,
    // which is cheaper than regex matching. The checks run from most to least specific.
    string literal;
    if (RE2::FullMatch(pattern_str, EQUALS_RE, &literal)) {
      state->SetSearchString(literal);
      state->function_ = ConstantEqualsFn;
      return;
    }
    if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &literal)) {
      state->SetSearchString(literal);
      state->function_ = ConstantStartsWithFn;
      return;
    }
    if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &literal)) {
      state->SetSearchString(literal);
      state->function_ = ConstantEndsWithFn;
      return;
    }
    if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &literal)) {
      state->SetSearchString(literal);
      state->function_ = ConstantSubstringFn;
      return;
    }
  }

  // General case: compile the pattern once and reuse it for every row.
  RE2::Options opts;
  opts.set_case_sensitive(case_sensitive);
  state->regex_.reset(new RE2(pattern_str, opts));
  if (!state->regex_->ok()) {
    context->SetError(
        Substitute("Invalid regex expression: '$0'", pattern_str).c_str());
  }
  state->function_ = ConstantRegexFnPartial;
}
// Prepare function for the 3-parameter form of regexp_like(); the 2-parameter form is
// prepared by RegexPrepare() instead.
//
// Allocates the THREAD_LOCAL LikePredicateState. If both the pattern (arg 1) and the
// match parameter (arg 2) are constant, the regular expression is compiled once here;
// otherwise it is compiled per row in RegexpLikeInternal(). A NULL match parameter, an
// unparsable match parameter, or an invalid pattern is reported via SetError().
void LikePredicate::RegexpLikePrepare(FunctionContext* context,
    FunctionContext::FunctionStateScope scope) {
  if (scope != FunctionContext::THREAD_LOCAL) return;
  LikePredicateState* state = new LikePredicateState();
  context->SetFunctionState(scope, state);

  const bool all_constant = context->IsArgConstant(1) && context->IsArgConstant(2);
  if (!all_constant) return;

  StringVal* pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1));
  if (pattern->is_null) return;

  StringVal* match_parameter =
      reinterpret_cast<StringVal*>(context->GetConstantArg(2));
  if (match_parameter->is_null) {
    context->SetError("NULL match parameter");
    return;
  }

  // Translate the match parameter into RE2 options.
  RE2::Options opts;
  string error_str;
  const bool options_ok =
      StringFunctions::SetRE2Options(*match_parameter, &error_str, &opts);
  if (!options_ok) {
    context->SetError(error_str.c_str());
    return;
  }

  string pattern_str(reinterpret_cast<const char*>(pattern->ptr), pattern->len);
  state->regex_.reset(new RE2(pattern_str, opts));
  if (state->regex_->ok()) return;
  context->SetError(
      Substitute("Invalid regex expression: '$0'", pattern_str).c_str());
}
// Per-row evaluation for the 3-parameter version of regexp_like(). The 2-parameter
// version calls Regex() directly.
//
// Returns NULL if 'val' or 'pattern' is NULL. When both the pattern and the match
// parameter are constant, evaluation goes through ConstantRegexFnPartial(), which uses
// the regex stored in the function state. Otherwise the regex is recompiled for this
// row: a NULL match parameter yields NULL, and a bad match parameter or bad pattern sets
// an error on 'context' and yields false.
BooleanVal LikePredicate::RegexpLikeInternal(FunctionContext* context,
    const StringVal& val, const StringVal& pattern, const StringVal& match_parameter) {
  if (val.is_null || pattern.is_null) return BooleanVal::null();

  const bool all_constant = context->IsArgConstant(2) && context->IsArgConstant(1);
  if (all_constant) return ConstantRegexFnPartial(context, val, pattern);

  // Either the pattern or the match parameter varies per row, so compile here.
  if (match_parameter.is_null) return BooleanVal::null();
  RE2::Options opts;
  string error_str;
  if (!StringFunctions::SetRE2Options(match_parameter, &error_str, &opts)) {
    context->SetError(error_str.c_str());
    return BooleanVal(false);
  }

  string re_pattern(reinterpret_cast<const char*>(pattern.ptr), pattern.len);
  re2::RE2 re(re_pattern, opts);
  if (!re.ok()) {
    context->SetError(Substitute("Invalid regex: $0", re_pattern).c_str());
    return BooleanVal(false);
  }
  const re2::StringPiece operand(reinterpret_cast<const char*>(val.ptr), val.len);
  return RE2::PartialMatch(operand, re);
}
// Deletes the THREAD_LOCAL LikePredicateState stored on 'context' and clears the
// stored pointer. Any other scope is ignored.
void LikePredicate::RegexClose(FunctionContext* context,
    FunctionContext::FunctionStateScope scope) {
  if (scope != FunctionContext::THREAD_LOCAL) return;
  auto* regex_state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  delete regex_state;
  context->SetFunctionState(FunctionContext::THREAD_LOCAL, nullptr);
}
// Evaluates 'val' against 'pattern' via RegexMatch(), treating the pattern as a regular
// expression rather than a LIKE pattern.
BooleanVal LikePredicate::RegexFn(FunctionContext* context, const StringVal& val,
    const StringVal& pattern) {
  const bool is_like_pattern = false;
  return RegexMatch(context, val, pattern, is_like_pattern);
}
// Evaluates 'val' against 'pattern' via RegexMatch(), treating the pattern as a LIKE
// pattern.
BooleanVal LikePredicate::LikeFn(FunctionContext* context, const StringVal& val,
    const StringVal& pattern) {
  const bool is_like_pattern = true;
  return RegexMatch(context, val, pattern, is_like_pattern);
}
// Returns true if 'val' contains the search string stored in the function state, NULL
// if 'val' is NULL. An empty search string matches every non-NULL value. 'pattern' is
// not read; the search string comes from the state.
BooleanVal LikePredicate::ConstantSubstringFn(FunctionContext* context,
    const StringVal& val, const StringVal& pattern) {
  if (val.is_null) return BooleanVal::null();
  auto* state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  if (state->search_string_sv_.len == 0) return BooleanVal(true);
  StringValue haystack = StringValue::FromStringVal(val);
  // Search() returning -1 is treated as "not found".
  const bool found = state->substring_pattern_.Search(&haystack) != -1;
  return BooleanVal(found);
}
// Returns true if 'val' begins with the search string stored in the function state,
// NULL if 'val' is NULL. 'pattern' is not read; the search string comes from the state.
BooleanVal LikePredicate::ConstantStartsWithFn(FunctionContext* context,
    const StringVal& val, const StringVal& pattern) {
  if (val.is_null) return BooleanVal::null();
  auto* state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  auto& needle = state->search_string_sv_;
  // A value shorter than the search string cannot start with it.
  if (val.len < needle.len) return BooleanVal(false);
  // Compare only the leading needle.len bytes of 'val'.
  StringValue prefix(reinterpret_cast<char*>(val.ptr), needle.len);
  return BooleanVal(needle.Eq(prefix));
}
// Returns true if 'val' ends with the search string stored in the function state, NULL
// if 'val' is NULL. 'pattern' is not read; the search string comes from the state.
BooleanVal LikePredicate::ConstantEndsWithFn(FunctionContext* context,
    const StringVal& val, const StringVal& pattern) {
  if (val.is_null) return BooleanVal::null();
  auto* state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  auto& needle = state->search_string_sv_;
  // A value shorter than the search string cannot end with it.
  if (val.len < needle.len) return BooleanVal(false);
  // Compare only the trailing needle.len bytes of 'val'.
  int suffix_len = needle.len;
  char* suffix_start = reinterpret_cast<char*>(val.ptr) + val.len - needle.len;
  StringValue suffix(suffix_start, suffix_len);
  return BooleanVal(needle.Eq(suffix));
}
// Returns true if 'val' is equal to the search string stored in the function state,
// NULL if 'val' is NULL. 'pattern' is not read; the search string comes from the state.
BooleanVal LikePredicate::ConstantEqualsFn(FunctionContext* context, const StringVal& val,
    const StringVal& pattern) {
  if (val.is_null) return BooleanVal::null();
  auto* state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  const StringValue operand = StringValue::FromStringVal(val);
  return BooleanVal(state->search_string_sv_.Eq(operand));
}
// Returns true if the regex stored in the function state matches anywhere within
// 'val' (partial match), NULL if 'val' is NULL. 'pattern' is not read.
BooleanVal LikePredicate::ConstantRegexFnPartial(FunctionContext* context,
    const StringVal& val, const StringVal& pattern) {
  if (val.is_null) return BooleanVal::null();
  auto* state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  const re2::StringPiece text(reinterpret_cast<const char*>(val.ptr), val.len);
  const bool matched = RE2::PartialMatch(text, *state->regex_);
  return matched;
}
// Returns true if the regex stored in the function state matches the whole of 'val'
// (full match), NULL if 'val' is NULL. 'pattern' is not read.
BooleanVal LikePredicate::ConstantRegexFn(FunctionContext* context,
    const StringVal& val, const StringVal& pattern) {
  if (val.is_null) return BooleanVal::null();
  auto* state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  const re2::StringPiece text(reinterpret_cast<const char*>(val.ptr), val.len);
  const bool matched = RE2::FullMatch(text, *state->regex_);
  return matched;
}
// General matcher shared by LikeFn and RegexFn. Returns NULL if either input is NULL.
// LIKE patterns ('is_like_pattern' == true) must match the entire operand; regex
// patterns only need to match somewhere inside it.
//
// With a constant pattern (arg 1) the regex stored in the function state is used.
// Otherwise the pattern is compiled for this row; a pattern that fails to compile sets
// an error on 'context' and yields false.
// NOTE(review): the per-row path never calls set_case_sensitive(), so the
// case-insensitive prepare variants look like they are not honoured for non-constant
// patterns -- confirm against the header/state before relying on it.
BooleanVal LikePredicate::RegexMatch(FunctionContext* context,
    const StringVal& operand_value, const StringVal& pattern_value,
    bool is_like_pattern) {
  if (operand_value.is_null || pattern_value.is_null) return BooleanVal::null();
  const re2::StringPiece operand(
      reinterpret_cast<const char*>(operand_value.ptr), operand_value.len);

  if (context->IsArgConstant(1)) {
    auto* state = reinterpret_cast<LikePredicateState*>(
        context->GetFunctionState(FunctionContext::THREAD_LOCAL));
    const re2::RE2& compiled = *state->regex_.get();
    if (is_like_pattern) return RE2::FullMatch(operand, compiled);
    return RE2::PartialMatch(operand, compiled);
  }

  // Non-constant pattern: build the regex text and options for this row.
  RE2::Options opts;
  string re_pattern;
  if (is_like_pattern) {
    ConvertLikePattern(context, pattern_value, &re_pattern);
    // Let '.' (generated for '%' and '_') match newlines as well.
    opts.set_never_nl(false);
    opts.set_dot_nl(true);
  } else {
    re_pattern =
        string(reinterpret_cast<const char*>(pattern_value.ptr), pattern_value.len);
  }

  re2::RE2 re(re_pattern, opts);
  if (!re.ok()) {
    // Report the pattern as the user wrote it, not the converted regex.
    string pattern_str(
        reinterpret_cast<const char*>(pattern_value.ptr), pattern_value.len);
    context->SetError(Substitute("Invalid regex: $0", pattern_str).c_str());
    return BooleanVal(false);
  }
  if (is_like_pattern) return RE2::FullMatch(operand, re);
  return RE2::PartialMatch(operand, re);
}
// Translates the LIKE pattern 'pattern' into regex syntax, overwriting '*re_pattern'.
//  - an unescaped '%' becomes ".*" and an unescaped '_' becomes "."
//  - an unescaped occurrence of the escape character stored in the function state is
//    dropped and marks the next character as escaped
//  - regex metacharacters are prefixed with a backslash so they match literally
//  - everything else, including an escaped '%' or '_', is copied through unchanged
void LikePredicate::ConvertLikePattern(FunctionContext* context, const StringVal& pattern,
    string* re_pattern) {
  re_pattern->clear();
  LikePredicateState* state = reinterpret_cast<LikePredicateState*>(
      context->GetFunctionState(FunctionContext::THREAD_LOCAL));
  bool is_escaped = false;
  for (int i = 0; i < pattern.len; ++i) {
    const auto c = pattern.ptr[i];
    if (!is_escaped) {
      if (c == '%') {
        re_pattern->append(".*");
        continue;
      }
      if (c == '_') {
        re_pattern->append(".");
        continue;
      }
      // check for escape char before checking for regex special chars, they might
      // overlap
      if (c == state->escape_char_) {
        is_escaped = true;
        continue;
      }
    }
    switch (c) {
      // escape all regex special characters; see list at
      // http://www.boost.org/doc/libs/1_47_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html
      case '.':
      case '[':
      case ']':
      case '{':
      case '}':
      case '(':
      case ')':
      case '\\':
      case '*':
      case '+':
      case '?':
      case '|':
      case '^':
      case '$':
        re_pattern->append("\\");
        break;
      default:
        // regular character or escaped special character
        break;
    }
    re_pattern->append(1, c);
    is_escaped = false;
  }
}
} // namespace impala