| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.sandbox.queries; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Objects; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.index.MultiTerms; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.TermStates; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.BooleanClause; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.BoostAttribute; |
| import org.apache.lucene.search.BoostQuery; |
| import org.apache.lucene.search.ConstantScoreQuery; |
| import org.apache.lucene.search.FuzzyTermsEnum; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.QueryVisitor; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.similarities.ClassicSimilarity; |
| import org.apache.lucene.search.similarities.TFIDFSimilarity; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.PriorityQueue; |
| import org.apache.lucene.util.automaton.LevenshteinAutomata; |
| |
| /** |
| * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms. |
| * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration |
| * of fuzzy scoring factors. |
| * This generally produces good results for queries where users may provide details in a number of |
| * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and |
| * a fast query. |
| * |
| * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because |
| * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized |
| * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer |
| * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query |
| * term) and this is factored into the variant's boost. If the source query term does not exist in the |
| * index the average IDF of the variants is used. |
| */ |
| public class FuzzyLikeThisQuery extends Query |
| { |
| // TODO: generalize this query (at least it should not reuse this static sim! |
| // a better way might be to convert this into multitermquery rewrite methods. |
| // the rewrite method can 'average' the TermStates's term statistics (docfreq,totalTermFreq) |
| // provided to TermQuery, so that the general idea is agnostic to any scoring system... |
| static TFIDFSimilarity sim=new ClassicSimilarity(); |
| ArrayList<FieldVals> fieldVals=new ArrayList<>(); |
| Analyzer analyzer; |
| |
| int MAX_VARIANTS_PER_TERM=50; |
| boolean ignoreTF=false; |
| private int maxNumTerms; |
| |
| @Override |
| public int hashCode() { |
| int prime = 31; |
| int result = classHash(); |
| result = prime * result + Objects.hashCode(analyzer); |
| result = prime * result + Objects.hashCode(fieldVals); |
| result = prime * result + (ignoreTF ? 1231 : 1237); |
| result = prime * result + maxNumTerms; |
| return result; |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| return sameClassAs(other) && |
| equalsTo(getClass().cast(other)); |
| } |
| |
| private boolean equalsTo(FuzzyLikeThisQuery other) { |
| return Objects.equals(analyzer, other.analyzer) && |
| Objects.equals(fieldVals, other.fieldVals) && |
| ignoreTF == other.ignoreTF && |
| maxNumTerms == other.maxNumTerms; |
| } |
| |
| /** |
| * |
| * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery |
| */ |
| public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) |
| { |
| this.analyzer=analyzer; |
| this.maxNumTerms = maxNumTerms; |
| } |
| |
| static class FieldVals |
| { |
| String queryString; |
| String fieldName; |
| int maxEdits; |
| int prefixLength; |
| public FieldVals(String name, int maxEdits, int length, String queryString) |
| { |
| fieldName = name; |
| this.maxEdits = maxEdits; |
| prefixLength = length; |
| this.queryString = queryString; |
| } |
| |
| @Override |
| public int hashCode() { |
| final int prime = 31; |
| int result = 1; |
| result = prime * result |
| + ((fieldName == null) ? 0 : fieldName.hashCode()); |
| result = prime * result + maxEdits; |
| result = prime * result + prefixLength; |
| result = prime * result |
| + ((queryString == null) ? 0 : queryString.hashCode()); |
| return result; |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) |
| return true; |
| if (obj == null) |
| return false; |
| if (getClass() != obj.getClass()) |
| return false; |
| FieldVals other = (FieldVals) obj; |
| if (fieldName == null) { |
| if (other.fieldName != null) |
| return false; |
| } else if (!fieldName.equals(other.fieldName)) |
| return false; |
| if (maxEdits != other.maxEdits) { |
| return false; |
| } |
| if (prefixLength != other.prefixLength) |
| return false; |
| if (queryString == null) { |
| if (other.queryString != null) |
| return false; |
| } else if (!queryString.equals(other.queryString)) |
| return false; |
| return true; |
| } |
| |
| |
| |
| } |
| |
| /** |
| * Adds user input for "fuzzification" |
| * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed |
| * @param minSimilarity The minimum similarity of the term variants; must be 0, 1 or 2 (see FuzzyTermsEnum) |
| * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum) |
| */ |
| public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) |
| { |
| int maxEdits = (int) minSimilarity; |
| if (maxEdits != minSimilarity || maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { |
| throw new IllegalArgumentException("minSimilarity must integer value between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got " + minSimilarity); |
| } |
| fieldVals.add(new FieldVals(fieldName,maxEdits,prefixLength,queryString)); |
| } |
| |
| |
| private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException { |
| if (f.queryString == null) return; |
| final Terms terms = MultiTerms.getTerms(reader, f.fieldName); |
| if (terms == null) { |
| return; |
| } |
| try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) { |
| CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); |
| |
| int corpusNumDocs = reader.numDocs(); |
| HashSet<String> processedTerms = new HashSet<>(); |
| ts.reset(); |
| while (ts.incrementToken()) { |
| String term = termAtt.toString(); |
| if (!processedTerms.contains(term)) { |
| processedTerms.add(term); |
| ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term |
| float minScore = 0; |
| Term startTerm = new Term(f.fieldName, term); |
| FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true); |
| //store the df so all variants use same idf |
| int df = reader.docFreq(startTerm); |
| int numVariants = 0; |
| int totalVariantDocFreqs = 0; |
| BytesRef possibleMatch; |
| BoostAttribute boostAtt = |
| fe.attributes().addAttribute(BoostAttribute.class); |
| while ((possibleMatch = fe.next()) != null) { |
| numVariants++; |
| totalVariantDocFreqs += fe.docFreq(); |
| float score = boostAtt.getBoost(); |
| if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) { |
| ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm); |
| variantsQ.insertWithOverflow(st); |
| minScore = variantsQ.top().score; // maintain minScore |
| } |
| fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY); |
| } |
| |
| if (numVariants > 0) { |
| int avgDf = totalVariantDocFreqs / numVariants; |
| if (df == 0)//no direct match we can use as df for all variants |
| { |
| df = avgDf; //use avg df of all variants |
| } |
| |
| // take the top variants (scored by edit distance) and reset the score |
| // to include an IDF factor then add to the global queue for ranking |
| // overall top query terms |
| int size = variantsQ.size(); |
| for (int i = 0; i < size; i++) { |
| ScoreTerm st = variantsQ.pop(); |
| st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs); |
| q.insertWithOverflow(st); |
| } |
| } |
| } |
| } |
| ts.end(); |
| } |
| } |
| |
| private Query newTermQuery(IndexReader reader, Term term) throws IOException { |
| if (ignoreTF) { |
| return new ConstantScoreQuery(new TermQuery(term)); |
| } else { |
| // we build an artificial TermStates that will give an overall df and ttf |
| // equal to 1 |
| TermStates context = new TermStates(reader.getContext()); |
| for (LeafReaderContext leafContext : reader.leaves()) { |
| Terms terms = leafContext.reader().terms(term.field()); |
| if (terms != null) { |
| TermsEnum termsEnum = terms.iterator(); |
| if (termsEnum.seekExact(term.bytes())) { |
| int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1 |
| context.register(termsEnum.termState(), leafContext.ord, freq, freq); |
| } |
| } |
| } |
| return new TermQuery(term, context); |
| } |
| } |
| |
| @Override |
| public void visit(QueryVisitor visitor) { |
| visitor.visitLeaf(this); |
| } |
| |
| @Override |
| public Query rewrite(IndexReader reader) throws IOException |
| { |
| ScoreTermQueue q = new ScoreTermQueue(maxNumTerms); |
| //load up the list of possible terms |
| for (FieldVals f : fieldVals) { |
| addTerms(reader, f, q); |
| } |
| |
| BooleanQuery.Builder bq = new BooleanQuery.Builder(); |
| |
| //create BooleanQueries to hold the variants for each token/field pair and ensure it |
| // has no coord factor |
| //Step 1: sort the termqueries by term/field |
| HashMap<Term,ArrayList<ScoreTerm>> variantQueries=new HashMap<>(); |
| int size = q.size(); |
| for(int i = 0; i < size; i++) |
| { |
| ScoreTerm st = q.pop(); |
| ArrayList<ScoreTerm> l= variantQueries.get(st.fuzziedSourceTerm); |
| if(l==null) |
| { |
| l=new ArrayList<>(); |
| variantQueries.put(st.fuzziedSourceTerm,l); |
| } |
| l.add(st); |
| } |
| //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries |
| for (Iterator<ArrayList<ScoreTerm>> iter = variantQueries.values().iterator(); iter.hasNext();) |
| { |
| ArrayList<ScoreTerm> variants = iter.next(); |
| if(variants.size()==1) |
| { |
| //optimize where only one selected variant |
| ScoreTerm st= variants.get(0); |
| Query tq = newTermQuery(reader, st.term); |
| // set the boost to a mix of IDF and score |
| bq.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD); |
| } |
| else |
| { |
| BooleanQuery.Builder termVariants=new BooleanQuery.Builder(); |
| for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2 |
| .hasNext();) |
| { |
| ScoreTerm st = iterator2.next(); |
| // found a match |
| Query tq = newTermQuery(reader, st.term); |
| // set the boost using the ScoreTerm's score |
| termVariants.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD); // add to query |
| } |
| bq.add(termVariants.build(), BooleanClause.Occur.SHOULD); // add to query |
| } |
| } |
| //TODO possible alternative step 3 - organize above booleans into a new layer of field-based |
| // booleans with a minimum-should-match of NumFields-1? |
| return bq.build(); |
| } |
| |
| //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best |
| // term variants) then is reset with IDF for use in ranking against all other |
| // terms/fields |
| private static class ScoreTerm{ |
| public Term term; |
| public float score; |
| Term fuzziedSourceTerm; |
| |
| public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){ |
| this.term = term; |
| this.score = score; |
| this.fuzziedSourceTerm=fuzziedSourceTerm; |
| } |
| } |
| |
| private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> { |
| public ScoreTermQueue(int size){ |
| super(size); |
| } |
| |
| /* (non-Javadoc) |
| * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object) |
| */ |
| @Override |
| protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) { |
| if (termA.score== termB.score) |
| return termA.term.compareTo(termB.term) > 0; |
| else |
| return termA.score < termB.score; |
| } |
| |
| } |
| |
| /* (non-Javadoc) |
| * @see org.apache.lucene.search.Query#toString(java.lang.String) |
| */ |
| @Override |
| public String toString(String field) |
| { |
| return null; |
| } |
| |
| |
| public boolean isIgnoreTF() |
| { |
| return ignoreTF; |
| } |
| |
| |
| public void setIgnoreTF(boolean ignoreTF) |
| { |
| this.ignoreTF = ignoreTF; |
| } |
| |
| } |