blob: be759f2009c238ab987c74b681ced8161c415416 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.query.filter;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.RangeSet;
import com.google.common.io.BaseEncoding;
import com.google.common.primitives.Chars;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.extraction.ExtractionFn;
import org.apache.druid.segment.filter.LikeFilter;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class LikeDimFilter extends AbstractOptimizableDimFilter implements DimFilter
{
// Regex matching characters that are definitely okay to include unescaped in a regex.
// Leads to excessively paranoid escaping, although shouldn't affect runtime beyond compiling the regex.
private static final Pattern DEFINITELY_FINE = Pattern.compile("[\\w\\d\\s-]");
private final String dimension;
private final String pattern;
@Nullable
private final Character escapeChar;
@Nullable
private final ExtractionFn extractionFn;
@Nullable
private final FilterTuning filterTuning;
private final LikeMatcher likeMatcher;
@JsonCreator
public LikeDimFilter(
@JsonProperty("dimension") final String dimension,
@JsonProperty("pattern") final String pattern,
@JsonProperty("escape") @Nullable final String escape,
@JsonProperty("extractionFn") @Nullable final ExtractionFn extractionFn,
@JsonProperty("filterTuning") @Nullable final FilterTuning filterTuning
)
{
this.dimension = Preconditions.checkNotNull(dimension, "dimension");
this.pattern = Preconditions.checkNotNull(pattern, "pattern");
this.extractionFn = extractionFn;
this.filterTuning = filterTuning;
if (escape != null && escape.length() != 1) {
throw new IllegalArgumentException("Escape must be null or a single character");
} else {
this.escapeChar = escape == null ? null : escape.charAt(0);
}
this.likeMatcher = LikeMatcher.from(pattern, this.escapeChar);
}
@VisibleForTesting
public LikeDimFilter(
final String dimension,
final String pattern,
@Nullable final String escape,
@Nullable final ExtractionFn extractionFn
)
{
this(dimension, pattern, escape, extractionFn, null);
}
@JsonProperty
public String getDimension()
{
return dimension;
}
@JsonProperty
public String getPattern()
{
return pattern;
}
@Nullable
@JsonProperty
public String getEscape()
{
return escapeChar != null ? escapeChar.toString() : null;
}
@Nullable
@JsonProperty
public ExtractionFn getExtractionFn()
{
return extractionFn;
}
@Nullable
@JsonInclude(JsonInclude.Include.NON_NULL)
@JsonProperty
public FilterTuning getFilterTuning()
{
return filterTuning;
}
@Override
public byte[] getCacheKey()
{
final byte[] dimensionBytes = StringUtils.toUtf8(dimension);
final byte[] patternBytes = StringUtils.toUtf8(pattern);
final byte[] escapeBytes = escapeChar == null ? new byte[0] : Chars.toByteArray(escapeChar);
final byte[] extractionFnBytes = extractionFn == null ? new byte[0] : extractionFn.getCacheKey();
final int sz = 4 + dimensionBytes.length + patternBytes.length + escapeBytes.length + extractionFnBytes.length;
return ByteBuffer.allocate(sz)
.put(DimFilterUtils.LIKE_CACHE_ID)
.put(dimensionBytes)
.put(DimFilterUtils.STRING_SEPARATOR)
.put(patternBytes)
.put(DimFilterUtils.STRING_SEPARATOR)
.put(escapeBytes)
.put(DimFilterUtils.STRING_SEPARATOR)
.put(extractionFnBytes)
.array();
}
@Override
public Filter toFilter()
{
return new LikeFilter(dimension, extractionFn, likeMatcher, filterTuning);
}
@Override
public RangeSet<String> getDimensionRangeSet(String dimension)
{
return null;
}
@Override
public Set<String> getRequiredColumns()
{
return ImmutableSet.of(dimension);
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
LikeDimFilter that = (LikeDimFilter) o;
return dimension.equals(that.dimension) &&
pattern.equals(that.pattern) &&
Objects.equals(escapeChar, that.escapeChar) &&
Objects.equals(extractionFn, that.extractionFn) &&
Objects.equals(filterTuning, that.filterTuning);
}
@Override
public int hashCode()
{
return Objects.hash(dimension, pattern, escapeChar, extractionFn, filterTuning);
}
@Override
public String toString()
{
final DimFilterToStringBuilder builder = new DimFilterToStringBuilder();
builder.appendDimension(dimension, extractionFn).append(" LIKE '").append(pattern).append("'");
if (escapeChar != null) {
builder.append(" ESCAPE '").append(escapeChar).append("'");
}
return builder.appendFilterTuning(filterTuning).build();
}
public static class LikeMatcher
{
public enum SuffixMatch
{
MATCH_ANY,
MATCH_EMPTY,
MATCH_PATTERN
}
// Strings match if:
// (a) suffixMatch is MATCH_ANY and they start with "prefix"
// (b) suffixMatch is MATCH_EMPTY and they start with "prefix" and contain nothing after prefix
// (c) suffixMatch is MATCH_PATTERN and the string matches "pattern"
private final SuffixMatch suffixMatch;
// Prefix that matching strings are known to start with. May be empty.
private final String prefix;
// Regex patterns that describes matching strings.
private final List<Pattern> pattern;
private final String likePattern;
private LikeMatcher(
final String likePattern,
final SuffixMatch suffixMatch,
final String prefix,
final List<Pattern> pattern
)
{
this.likePattern = likePattern;
this.suffixMatch = Preconditions.checkNotNull(suffixMatch, "suffixMatch");
this.prefix = NullHandling.nullToEmptyIfNeeded(prefix);
this.pattern = Preconditions.checkNotNull(pattern, "pattern");
}
public static LikeMatcher from(
final String likePattern,
@Nullable final Character escapeChar
)
{
final StringBuilder prefix = new StringBuilder();
// Splits the input on % to leave only eagerly-matchable sub-patterns. This is to avoid catastrophic backtracking:
// https://www.rexegg.com/regex-explosive-quantifiers.html#remote
final List<Pattern> pattern = new ArrayList<>();
final StringBuilder regex = new StringBuilder("^");
boolean escaping = false;
boolean inPrefix = true;
SuffixMatch suffixMatch = SuffixMatch.MATCH_EMPTY;
for (int i = 0; i < likePattern.length(); i++) {
final char c = likePattern.charAt(i);
if (escapeChar != null && c == escapeChar && !escaping) {
escaping = true;
} else if (c == '%' && !escaping) {
inPrefix = false;
if (suffixMatch == SuffixMatch.MATCH_EMPTY) {
suffixMatch = SuffixMatch.MATCH_ANY;
}
if (regex.length() > 0) {
if (regex.length() > 1 || regex.charAt(0) != '^') {
pattern.add(Pattern.compile(regex.toString(), Pattern.DOTALL));
}
regex.setLength(0);
}
} else if (c == '_' && !escaping) {
inPrefix = false;
suffixMatch = SuffixMatch.MATCH_PATTERN;
regex.append('.');
} else {
if (inPrefix) {
prefix.append(c);
} else {
suffixMatch = SuffixMatch.MATCH_PATTERN;
}
addPatternCharacter(regex, c);
escaping = false;
}
}
if (likePattern.isEmpty()) {
pattern.add(Pattern.compile("^$"));
} else if (regex.length() > 0) {
regex.append('$');
pattern.add(Pattern.compile(regex.toString(), Pattern.DOTALL));
}
return new LikeMatcher(likePattern, suffixMatch, prefix.toString(), pattern);
}
private static void addPatternCharacter(final StringBuilder patternBuilder, final char c)
{
if (DEFINITELY_FINE.matcher(String.valueOf(c)).matches()) {
patternBuilder.append(c);
} else {
patternBuilder.append("\\u").append(BaseEncoding.base16().encode(Chars.toByteArray(c)));
}
}
public DruidPredicateMatch matches(@Nullable final String s)
{
return matches(s, pattern);
}
private static DruidPredicateMatch matches(@Nullable final String s, List<Pattern> pattern)
{
String val = NullHandling.nullToEmptyIfNeeded(s);
if (val == null) {
return DruidPredicateMatch.UNKNOWN;
}
if (pattern.size() == 1) {
// Most common case is a single pattern: a% => ^a, %z => z$, %m% => m
return DruidPredicateMatch.of(pattern.get(0).matcher(val).find());
}
int offset = 0;
for (Pattern part : pattern) {
Matcher matcher = part.matcher(val);
if (!matcher.find(offset)) {
return DruidPredicateMatch.FALSE;
}
offset = matcher.end();
}
return DruidPredicateMatch.TRUE;
}
/**
* Checks if the suffix of "value" matches the suffix of this matcher. The first prefix.length() characters
* of "value" are ignored. This method is useful if you've already independently verified the prefix.
*/
public DruidPredicateMatch matchesSuffixOnly(@Nullable String value)
{
if (suffixMatch == SuffixMatch.MATCH_ANY) {
return DruidPredicateMatch.TRUE;
} else if (suffixMatch == SuffixMatch.MATCH_EMPTY) {
return value == null ? matches(null) : DruidPredicateMatch.of(value.length() == prefix.length());
} else {
// suffixMatch is MATCH_PATTERN
return matches(value);
}
}
public DruidPredicateFactory predicateFactory(final ExtractionFn extractionFn)
{
return new PatternDruidPredicateFactory(extractionFn, pattern);
}
public String getPrefix()
{
return prefix;
}
public SuffixMatch getSuffixMatch()
{
return suffixMatch;
}
@VisibleForTesting
String describeCompilation()
{
return likePattern + " => " + prefix + ":" + pattern;
}
@VisibleForTesting
static class PatternDruidPredicateFactory implements DruidPredicateFactory
{
private final ExtractionFn extractionFn;
private final List<Pattern> pattern;
PatternDruidPredicateFactory(ExtractionFn extractionFn, List<Pattern> pattern)
{
this.extractionFn = extractionFn;
this.pattern = pattern;
}
@Override
public DruidObjectPredicate<String> makeStringPredicate()
{
if (extractionFn != null) {
return input -> matches(extractionFn.apply(input), pattern);
} else {
return input -> matches(input, pattern);
}
}
@Override
public DruidLongPredicate makeLongPredicate()
{
if (extractionFn != null) {
return input -> matches(extractionFn.apply(input), pattern);
} else {
return input -> matches(String.valueOf(input), pattern);
}
}
@Override
public DruidFloatPredicate makeFloatPredicate()
{
if (extractionFn != null) {
return input -> matches(extractionFn.apply(input), pattern);
} else {
return input -> matches(String.valueOf(input), pattern);
}
}
@Override
public DruidDoublePredicate makeDoublePredicate()
{
if (extractionFn != null) {
return input -> matches(extractionFn.apply(input), pattern);
} else {
return input -> matches(String.valueOf(input), pattern);
}
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
PatternDruidPredicateFactory that = (PatternDruidPredicateFactory) o;
return Objects.equals(extractionFn, that.extractionFn) &&
Objects.equals(pattern.toString(), that.pattern.toString());
}
@Override
public int hashCode()
{
return Objects.hash(extractionFn, pattern.toString());
}
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
LikeMatcher that = (LikeMatcher) o;
return getSuffixMatch() == that.getSuffixMatch() &&
Objects.equals(getPrefix(), that.getPrefix()) &&
Objects.equals(pattern.toString(), that.pattern.toString());
}
@Override
public int hashCode()
{
return Objects.hash(getSuffixMatch(), getPrefix(), pattern.toString());
}
@Override
public String toString()
{
return likePattern;
}
}
}