// blob: cdc436d933717af9c58f63700acf61054c53ec98 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.monitor;
import java.util.Collections;
import java.util.Set;
import java.util.function.BiConsumer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.util.BytesRef;
/**
 * A query handler implementation that matches Regexp queries by indexing regex
 * terms by their longest static substring, and generates ngrams from Document
 * tokens to match them.
 * <p>
 * This implementation will filter out more wildcard queries than TermFilteredPresearcher,
 * at the expense of longer document build times. Which one is more performant will depend
 * on the type and number of queries registered in the Monitor, and the size of documents
 * to be monitored. Profiling is recommended.
 * <p>
 * The static substring is located with a lightweight scan, not a full regular expression
 * parse: only the {@code .}, {@code ?}, {@code *} and {@code +} operators are recognised.
 * Any other regular expression syntax (alternation, character classes, groups, escapes,
 * bounded repetition, ...) is treated as literal text.
 */
public class RegexpQueryHandler implements CustomQueryHandler {

  /**
   * The default suffix with which to mark ngrams
   */
  public static final String DEFAULT_NGRAM_SUFFIX = "XX";

  /**
   * The default maximum length of an input token before wildcard tokens are generated
   */
  public static final int DEFAULT_MAX_TOKEN_SIZE = 30;

  /**
   * The default token to emit if a term is longer than the maximum token size
   */
  public static final String DEFAULT_WILDCARD_TOKEN = "__WILDCARD__";

  /**
   * Delimiter used to cut a regexp into its literal pieces.
   * <p>
   * First alternative: one character followed by a quantifier run that contains at least
   * one {@code ?} or {@code *}. Such a character may be absent from a match, so it is
   * removed together with its quantifiers. It has to be tried first so that {@code .?}
   * and {@code .*} are consumed as a unit.
   * <p>
   * Second alternative: a lone operator character. For {@code x+} only the {@code +} is
   * removed, because {@code x} itself occurs at least once in every match.
   */
  private static final String LITERAL_BOUNDARY = ".\\+*[?*][?*+]*|[.?*+]";

  private final String ngramSuffix;
  private final String wildcardToken;
  private final BytesRef wildcardTokenBytes;
  private final int maxTokenSize;
  private final Set<String> excludedFields;

  /**
   * Creates a new RegexpQueryHandler
   *
   * @param ngramSuffix    the suffix with which to mark ngrams
   * @param maxTokenSize   the maximum length of an input token before WILDCARD tokens are generated
   * @param wildcardToken  the token to emit if a token is longer than maxTokenSize in length
   * @param excludedFields a Set of fields to ignore when generating ngrams; {@code null} is
   *                       treated as an empty set
   */
  public RegexpQueryHandler(String ngramSuffix, int maxTokenSize, String wildcardToken, Set<String> excludedFields) {
    this.ngramSuffix = ngramSuffix;
    this.maxTokenSize = maxTokenSize;
    this.wildcardTokenBytes = new BytesRef(wildcardToken);
    this.wildcardToken = wildcardToken;
    this.excludedFields = excludedFields == null ? Collections.emptySet() : excludedFields;
  }

  /**
   * Creates a new RegexpQueryHandler using default settings
   */
  public RegexpQueryHandler() {
    this(DEFAULT_NGRAM_SUFFIX, DEFAULT_MAX_TOKEN_SIZE, DEFAULT_WILDCARD_TOKEN, null);
  }

  /**
   * Creates a new RegexpQueryHandler with a maximum token size
   *
   * @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated
   */
  public RegexpQueryHandler(int maxTokenSize) {
    this(DEFAULT_NGRAM_SUFFIX, maxTokenSize, DEFAULT_WILDCARD_TOKEN, null);
  }

  /**
   * Wraps a document token stream in a {@link SuffixingNGramTokenFilter} configured with
   * this handler's suffix, wildcard token and maximum token size.
   *
   * @param field the field the stream belongs to
   * @param ts    the stream to wrap
   * @return {@code ts} unchanged if {@code field} is one of the excluded fields, otherwise
   *         the wrapping filter
   */
  @Override
  public TokenStream wrapTermStream(String field, TokenStream ts) {
    if (excludedFields.contains(field)) {
      return ts;
    }
    return new SuffixingNGramTokenFilter(ts, ngramSuffix, wildcardToken, maxTokenSize);
  }

  /**
   * Builds a {@link QueryTree} for a {@link RegexpQuery} from the longest literal piece of
   * its regular expression, marked with the ngram suffix.
   *
   * @param q            the query to handle
   * @param termWeightor used to weight the selected term
   * @return a single-node QueryTree, or {@code null} if {@code q} is not a RegexpQuery
   */
  @Override
  public QueryTree handleQuery(Query q, TermWeightor termWeightor) {
    if (q instanceof RegexpQuery == false) {
      return null;
    }
    RegexpQuery query = (RegexpQuery) q;
    // Render against the query's own field: RegexpQuery.toString(field) leaves out the
    // "field:" prefix in that case, so neither the field name nor a ':' inside the
    // regexp can confuse the extraction below.
    String regexp = parseOutRegexp(query.toString(query.getField()));
    String selected = selectLongestSubstring(regexp);
    Term term = new Term(query.getField(), selected + ngramSuffix);
    double weight = termWeightor.applyAsDouble(term);
    return new QueryTree() {
      @Override
      public double weight() {
        return weight;
      }

      @Override
      public void collectTerms(BiConsumer<String, BytesRef> termCollector) {
        termCollector.accept(term.field(), term.bytes());
        // Also register the wildcard token, which stands in for document tokens that
        // were longer than maxTokenSize (see the constructor documentation).
        termCollector.accept(term.field(), wildcardTokenBytes);
      }

      @Override
      public boolean advancePhase(double minWeight) {
        return false;
      }

      @Override
      public String toString(int depth) {
        return space(depth) + "WILDCARD_NGRAM[" + term.toString() + "]^" + weight;
      }
    };
  }

  /**
   * Extracts the regular expression from a rendered query of the form {@code /regexp/}
   * (an optional slash-free {@code field:} prefix is tolerated).
   *
   * @param rep the string form of the query
   * @return the text between the first and the last {@code /}
   */
  private static String parseOutRegexp(String rep) {
    int firstSlash = rep.indexOf('/');
    int lastSlash = rep.lastIndexOf('/');
    return rep.substring(firstSlash + 1, lastSlash);
  }

  /**
   * Picks the longest literal piece of a regular expression, i.e. the longest run of
   * characters between the operators described by {@link #LITERAL_BOUNDARY}.
   *
   * @param regexp the regular expression text
   * @return the longest piece (the first one on ties), or the empty string if there is none
   */
  private static String selectLongestSubstring(String regexp) {
    String selected = "";
    for (String substr : regexp.split(LITERAL_BOUNDARY)) {
      if (substr.length() > selected.length()) {
        selected = substr;
      }
    }
    return selected;
  }
}