blob: a98a4793426fd6fa0a70be916a07399dfabb5e2d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.sandbox.queries;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
/**
* Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms. In
* effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration of
* fuzzy scoring factors. This generally produces good results for queries where users may provide
* details in a number of fields and have no knowledge of boolean query syntax and also want a
* degree of fuzzy matching and a fast query.
*
* <p>For each source term the fuzzy variants are held in a BooleanQuery with no coord factor
* (because we are not looking for matches on multiple variants in any one doc). Additionally, a
* specialized TermQuery is used for variants and does not use that variant term's IDF because this
* would favour rarer terms eg misspellings. Instead, all variants use the same IDF ranking (the one
* for the source query term) and this is factored into the variant's boost. If the source query
* term does not exist in the index the average IDF of the variants is used.
*/
public class FuzzyLikeThisQuery extends Query {
// TODO: generalize this query (at least it should not reuse this static sim!
// a better way might be to convert this into multitermquery rewrite methods.
// the rewrite method can 'average' the TermStates's term statistics (docfreq,totalTermFreq)
// provided to TermQuery, so that the general idea is agnostic to any scoring system...
static TFIDFSimilarity sim = new ClassicSimilarity();
ArrayList<FieldVals> fieldVals = new ArrayList<>();
Analyzer analyzer;
int MAX_VARIANTS_PER_TERM = 50;
boolean ignoreTF = false;
private int maxNumTerms;
@Override
public int hashCode() {
int prime = 31;
int result = classHash();
result = prime * result + Objects.hashCode(analyzer);
result = prime * result + Objects.hashCode(fieldVals);
result = prime * result + (ignoreTF ? 1231 : 1237);
result = prime * result + maxNumTerms;
return result;
}
@Override
public boolean equals(Object other) {
return sameClassAs(other) && equalsTo(getClass().cast(other));
}
private boolean equalsTo(FuzzyLikeThisQuery other) {
return Objects.equals(analyzer, other.analyzer)
&& Objects.equals(fieldVals, other.fieldVals)
&& ignoreTF == other.ignoreTF
&& maxNumTerms == other.maxNumTerms;
}
/**
* @param maxNumTerms The total number of terms clauses that will appear once rewritten as a
* BooleanQuery
*/
public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) {
this.analyzer = analyzer;
this.maxNumTerms = maxNumTerms;
}
static class FieldVals {
String queryString;
String fieldName;
int maxEdits;
int prefixLength;
public FieldVals(String name, int maxEdits, int length, String queryString) {
fieldName = name;
this.maxEdits = maxEdits;
prefixLength = length;
this.queryString = queryString;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((fieldName == null) ? 0 : fieldName.hashCode());
result = prime * result + maxEdits;
result = prime * result + prefixLength;
result = prime * result + ((queryString == null) ? 0 : queryString.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
FieldVals other = (FieldVals) obj;
if (fieldName == null) {
if (other.fieldName != null) {
return false;
}
} else if (!fieldName.equals(other.fieldName)) return false;
if (maxEdits != other.maxEdits) {
return false;
}
if (prefixLength != other.prefixLength) {
return false;
}
if (queryString == null) {
if (other.queryString != null) {
return false;
}
} else if (!queryString.equals(other.queryString)) {
return false;
}
return true;
}
}
/**
* Adds user input for "fuzzification"
*
* @param queryString The string which will be parsed by the analyzer and for which fuzzy variants
* will be parsed
* @param minSimilarity The minimum similarity of the term variants; must be 0, 1 or 2 (see
* FuzzyTermsEnum)
* @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum)
*/
public void addTerms(
String queryString, String fieldName, float minSimilarity, int prefixLength) {
int maxEdits = (int) minSimilarity;
if (maxEdits != minSimilarity
|| maxEdits < 0
|| maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException(
"minSimilarity must integer value between 0 and "
+ LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE
+ ", inclusive; got "
+ minSimilarity);
}
fieldVals.add(new FieldVals(fieldName, maxEdits, prefixLength, queryString));
}
private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
if (f.queryString == null) return;
final Terms terms = MultiTerms.getTerms(reader, f.fieldName);
if (terms == null) {
return;
}
try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
int corpusNumDocs = reader.numDocs();
HashSet<String> processedTerms = new HashSet<>();
ts.reset();
while (ts.incrementToken()) {
String term = termAtt.toString();
if (!processedTerms.contains(term)) {
processedTerms.add(term);
ScoreTermQueue variantsQ =
new ScoreTermQueue(
MAX_VARIANTS_PER_TERM); // maxNum variants considered for any one term
float minScore = 0;
Term startTerm = new Term(f.fieldName, term);
FuzzyTermsEnum fe =
new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
// store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants = 0;
int totalVariantDocFreqs = 0;
BytesRef possibleMatch;
BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
while ((possibleMatch = fe.next()) != null) {
numVariants++;
totalVariantDocFreqs += fe.docFreq();
float score = boostAtt.getBoost();
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
ScoreTerm st =
new ScoreTerm(
new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)),
score,
startTerm);
variantsQ.insertWithOverflow(st);
minScore = variantsQ.top().score; // maintain minScore
}
fe.setMaxNonCompetitiveBoost(
variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
}
if (numVariants > 0) {
int avgDf = totalVariantDocFreqs / numVariants;
if (df == 0) // no direct match we can use as df for all variants
{
df = avgDf; // use avg df of all variants
}
// take the top variants (scored by edit distance) and reset the score
// to include an IDF factor then add to the global queue for ranking
// overall top query terms
int size = variantsQ.size();
for (int i = 0; i < size; i++) {
ScoreTerm st = variantsQ.pop();
st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
q.insertWithOverflow(st);
}
}
}
}
ts.end();
}
}
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
if (ignoreTF) {
return new ConstantScoreQuery(new TermQuery(term));
} else {
// we build an artificial TermStates that will give an overall df and ttf
// equal to 1
TermStates context = new TermStates(reader.getContext());
for (LeafReaderContext leafContext : reader.leaves()) {
Terms terms = leafContext.reader().terms(term.field());
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(term.bytes())) {
int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
context.register(termsEnum.termState(), leafContext.ord, freq, freq);
}
}
}
return new TermQuery(term, context);
}
}
@Override
public void visit(QueryVisitor visitor) {
visitor.visitLeaf(this);
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
ScoreTermQueue q = new ScoreTermQueue(maxNumTerms);
// load up the list of possible terms
for (FieldVals f : fieldVals) {
addTerms(reader, f, q);
}
BooleanQuery.Builder bq = new BooleanQuery.Builder();
// create BooleanQueries to hold the variants for each token/field pair and ensure it
// has no coord factor
// Step 1: sort the termqueries by term/field
HashMap<Term, ArrayList<ScoreTerm>> variantQueries = new HashMap<>();
int size = q.size();
for (int i = 0; i < size; i++) {
ScoreTerm st = q.pop();
ArrayList<ScoreTerm> l = variantQueries.get(st.fuzziedSourceTerm);
if (l == null) {
l = new ArrayList<>();
variantQueries.put(st.fuzziedSourceTerm, l);
}
l.add(st);
}
// Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
for (Iterator<ArrayList<ScoreTerm>> iter = variantQueries.values().iterator();
iter.hasNext(); ) {
ArrayList<ScoreTerm> variants = iter.next();
if (variants.size() == 1) {
// optimize where only one selected variant
ScoreTerm st = variants.get(0);
Query tq = newTermQuery(reader, st.term);
// set the boost to a mix of IDF and score
bq.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD);
} else {
BooleanQuery.Builder termVariants = new BooleanQuery.Builder();
for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2.hasNext(); ) {
ScoreTerm st = iterator2.next();
// found a match
Query tq = newTermQuery(reader, st.term);
// set the boost using the ScoreTerm's score
termVariants.add(
new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD); // add to query
}
bq.add(termVariants.build(), BooleanClause.Occur.SHOULD); // add to query
}
}
// TODO possible alternative step 3 - organize above booleans into a new layer of field-based
// booleans with a minimum-should-match of NumFields-1?
return bq.build();
}
// Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
// term variants) then is reset with IDF for use in ranking against all other
// terms/fields
private static class ScoreTerm {
public Term term;
public float score;
Term fuzziedSourceTerm;
public ScoreTerm(Term term, float score, Term fuzziedSourceTerm) {
this.term = term;
this.score = score;
this.fuzziedSourceTerm = fuzziedSourceTerm;
}
}
private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
public ScoreTermQueue(int size) {
super(size);
}
/* (non-Javadoc)
* @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
*/
@Override
protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
if (termA.score == termB.score) {
return termA.term.compareTo(termB.term) > 0;
} else return termA.score < termB.score;
}
}
/* (non-Javadoc)
* @see org.apache.lucene.search.Query#toString(java.lang.String)
*/
@Override
public String toString(String field) {
return null;
}
public boolean isIgnoreTF() {
return ignoreTF;
}
public void setIgnoreTF(boolean ignoreTF) {
this.ignoreTF = ignoreTF;
}
}