| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Index; |
| using Lucene.Net.Search; |
| using Lucene.Net.Search.Similarities; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.Runtime.CompilerServices; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Sandbox.Queries |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms. |
| /// In effect this mixes the behaviour of <see cref="FuzzyQuery"/> and MoreLikeThis but with special consideration |
| /// of fuzzy scoring factors. |
| /// This generally produces good results for queries where users may provide details in a number of |
| /// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and |
| /// a fast query. |
| /// <para/> |
| /// For each source term the fuzzy variants are held in a <see cref="BooleanQuery"/> with no coord factor (because |
| /// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized |
| /// <see cref="TermQuery"/> is used for variants and does not use that variant term's IDF because this would favour rarer |
| /// terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query |
| /// term) and this is factored into the variant's boost. If the source query term does not exist in the |
| /// index the average IDF of the variants is used. |
| /// </summary> |
| public class FuzzyLikeThisQuery : Query |
| { |
| // TODO: generalize this query (at least it should not reuse this static sim! |
| // a better way might be to convert this into multitermquery rewrite methods. |
| // the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq) |
| // provided to TermQuery, so that the general idea is agnostic to any scoring system... |
| internal static TFIDFSimilarity sim = new DefaultSimilarity(); |
| private Query rewrittenQuery = null; |
| private readonly IList<FieldVals> fieldVals = new JCG.List<FieldVals>(); |
| private readonly Analyzer analyzer; |
| |
| private readonly ScoreTermQueue q; |
| private const int MAX_VARIANTS_PER_TERM = 50; |
| private bool ignoreTF = false; |
| private readonly int maxNumTerms; |
| |
| public override int GetHashCode() |
| { |
| int prime = 31; |
| int result = base.GetHashCode(); |
| result = prime * result + ((analyzer == null) ? 0 : analyzer.GetHashCode()); |
| result = prime * result |
| + ((fieldVals == null) ? 0 : fieldVals.GetHashCode()); |
| result = prime * result + (ignoreTF ? 1231 : 1237); |
| result = prime * result + maxNumTerms; |
| return result; |
| } |
| |
| public override bool Equals(object obj) |
| { |
| if (this == obj) |
| return true; |
| if (obj == null) |
| return false; |
| if (GetType() != obj.GetType()) |
| return false; |
| if (!base.Equals(obj)) |
| { |
| return false; |
| } |
| FuzzyLikeThisQuery other = (FuzzyLikeThisQuery)obj; |
| if (analyzer == null) |
| { |
| if (other.analyzer != null) |
| return false; |
| } |
| else if (!analyzer.Equals(other.analyzer)) |
| return false; |
| if (fieldVals == null) |
| { |
| if (other.fieldVals != null) |
| return false; |
| } |
| else if (!fieldVals.Equals(other.fieldVals)) |
| return false; |
| if (ignoreTF != other.ignoreTF) |
| return false; |
| if (maxNumTerms != other.maxNumTerms) |
| return false; |
| return true; |
| } |
| |
| /// <summary> |
| /// |
| /// </summary> |
| /// <param name="maxNumTerms">The total number of terms clauses that will appear once rewritten as a <see cref="BooleanQuery"/></param> |
| /// <param name="analyzer"></param> |
| public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) |
| { |
| q = new ScoreTermQueue(maxNumTerms); |
| this.analyzer = analyzer; |
| this.maxNumTerms = maxNumTerms; |
| } |
| |
| internal class FieldVals |
| { |
| internal string queryString; |
| internal string fieldName; |
| internal float minSimilarity; |
| internal int prefixLength; |
| public FieldVals(string name, float similarity, int length, string queryString) |
| { |
| fieldName = name; |
| minSimilarity = similarity; |
| prefixLength = length; |
| this.queryString = queryString; |
| } |
| |
| public override int GetHashCode() |
| { |
| int prime = 31; |
| int result = 1; |
| result = prime * result |
| + ((fieldName == null) ? 0 : fieldName.GetHashCode()); |
| result = prime * result + J2N.BitConversion.SingleToInt32Bits(minSimilarity); |
| result = prime * result + prefixLength; |
| result = prime * result |
| + ((queryString == null) ? 0 : queryString.GetHashCode()); |
| return result; |
| } |
| |
| public override bool Equals(object obj) |
| { |
| if (this == obj) |
| return true; |
| if (obj == null) |
| return false; |
| if (GetType() != obj.GetType()) |
| return false; |
| FieldVals other = (FieldVals)obj; |
| if (fieldName == null) |
| { |
| if (other.fieldName != null) |
| return false; |
| } |
| else if (!fieldName.Equals(other.fieldName, StringComparison.Ordinal)) |
| return false; |
| if (J2N.BitConversion.SingleToInt32Bits(minSimilarity) != J2N.BitConversion |
| .SingleToInt32Bits(other.minSimilarity)) |
| return false; |
| if (prefixLength != other.prefixLength) |
| return false; |
| if (queryString == null) |
| { |
| if (other.queryString != null) |
| return false; |
| } |
| else if (!queryString.Equals(other.queryString, StringComparison.Ordinal)) |
| return false; |
| return true; |
| } |
| } |
| |
| /// <summary> |
| /// Adds user input for "fuzzification" |
| /// </summary> |
| /// <param name="queryString">The string which will be parsed by the analyzer and for which fuzzy variants will be parsed</param> |
| /// <param name="fieldName">The minimum similarity of the term variants (see <see cref="FuzzyTermsEnum"/>)</param> |
| /// <param name="minSimilarity">Length of required common prefix on variant terms (see <see cref="FuzzyTermsEnum"/>)</param> |
| /// <param name="prefixLength"></param> |
| public virtual void AddTerms(string queryString, string fieldName, float minSimilarity, int prefixLength) |
| { |
| fieldVals.Add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString)); |
| } |
| |
| |
| private void AddTerms(IndexReader reader, FieldVals f) |
| { |
| if (f.queryString == null) return; |
| Terms terms = MultiFields.GetTerms(reader, f.fieldName); |
| if (terms == null) |
| { |
| return; |
| } |
| TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString); |
| try |
| { |
| ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>(); |
| |
| int corpusNumDocs = reader.NumDocs; |
| ISet<string> processedTerms = new JCG.HashSet<string>(); |
| ts.Reset(); |
| while (ts.IncrementToken()) |
| { |
| string term = termAtt.ToString(); |
| if (!processedTerms.Contains(term)) |
| { |
| processedTerms.Add(term); |
| ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term |
| float minScore = 0; |
| Term startTerm = new Term(f.fieldName, term); |
| AttributeSource atts = new AttributeSource(); |
| IMaxNonCompetitiveBoostAttribute maxBoostAtt = |
| atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>(); |
| #pragma warning disable 612, 618 |
| SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength); |
| #pragma warning restore 612, 618 |
| //store the df so all variants use same idf |
| int df = reader.DocFreq(startTerm); |
| int numVariants = 0; |
| int totalVariantDocFreqs = 0; |
| BytesRef possibleMatch; |
| IBoostAttribute boostAtt = |
| fe.Attributes.AddAttribute<IBoostAttribute>(); |
| while (fe.MoveNext()) |
| { |
| possibleMatch = fe.Term; |
| numVariants++; |
| totalVariantDocFreqs += fe.DocFreq; |
| float score = boostAtt.Boost; |
| if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore) |
| { |
| ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm); |
| variantsQ.InsertWithOverflow(st); |
| minScore = variantsQ.Top.Score; // maintain minScore |
| } |
| maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity; |
| } |
| |
| if (numVariants > 0) |
| { |
| int avgDf = totalVariantDocFreqs / numVariants; |
| if (df == 0)//no direct match we can use as df for all variants |
| { |
| df = avgDf; //use avg df of all variants |
| } |
| |
| // take the top variants (scored by edit distance) and reset the score |
| // to include an IDF factor then add to the global queue for ranking |
| // overall top query terms |
| int size = variantsQ.Count; |
| for (int i = 0; i < size; i++) |
| { |
| ScoreTerm st = variantsQ.Pop(); |
| st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs); |
| q.InsertWithOverflow(st); |
| } |
| } |
| } |
| } |
| ts.End(); |
| } |
| finally |
| { |
| IOUtils.DisposeWhileHandlingException(ts); |
| } |
| } |
| |
| public override Query Rewrite(IndexReader reader) |
| { |
| if (rewrittenQuery != null) |
| { |
| return rewrittenQuery; |
| } |
| //load up the list of possible terms |
| foreach (var f in fieldVals) |
| { |
| AddTerms(reader, f); |
| } |
| //clear the list of fields |
| fieldVals.Clear(); |
| |
| BooleanQuery bq = new BooleanQuery(); |
| |
| |
| //create BooleanQueries to hold the variants for each token/field pair and ensure it |
| // has no coord factor |
| //Step 1: sort the termqueries by term/field |
| IDictionary<Term, List<ScoreTerm>> variantQueries = new Dictionary<Term, List<ScoreTerm>>(); |
| int size = q.Count; |
| for (int i = 0; i < size; i++) |
| { |
| ScoreTerm st = q.Pop(); |
| //List<ScoreTerm> l = variantQueries.get(st.fuzziedSourceTerm); |
| // if(l==null) |
| List<ScoreTerm> l; |
| if (!variantQueries.TryGetValue(st.FuzziedSourceTerm, out l) || l == null) |
| { |
| l = new List<ScoreTerm>(); |
| variantQueries[st.FuzziedSourceTerm] = l; |
| } |
| l.Add(st); |
| } |
| //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries |
| foreach (List<ScoreTerm> variants in variantQueries.Values) |
| { |
| if (variants.Count == 1) |
| { |
| //optimize where only one selected variant |
| ScoreTerm st = variants[0]; |
| Query tq = ignoreTF ? (Query)new ConstantScoreQuery(new TermQuery(st.Term)) : new TermQuery(st.Term, 1); |
| tq.Boost = st.Score; // set the boost to a mix of IDF and score |
| bq.Add(tq, Occur.SHOULD); |
| } |
| else |
| { |
| BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants |
| foreach (ScoreTerm st in variants) |
| { |
| // found a match |
| Query tq = ignoreTF ? (Query)new ConstantScoreQuery(new TermQuery(st.Term)) : new TermQuery(st.Term, 1); |
| tq.Boost = st.Score; // set the boost using the ScoreTerm's score |
| termVariants.Add(tq, Occur.SHOULD); // add to query |
| } |
| bq.Add(termVariants, Occur.SHOULD); // add to query |
| } |
| } |
| //TODO possible alternative step 3 - organize above booleans into a new layer of field-based |
| // booleans with a minimum-should-match of NumFields-1? |
| bq.Boost = Boost; |
| this.rewrittenQuery = bq; |
| return bq; |
| } |
| |
| //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best |
| // term variants) then is reset with IDF for use in ranking against all other |
| // terms/fields |
| internal class ScoreTerm |
| { |
| public Term Term { get; set; } |
| public float Score { get; set; } |
| internal Term FuzziedSourceTerm { get; set; } |
| |
| public ScoreTerm(Term term, float score, Term fuzziedSourceTerm) |
| { |
| this.Term = term; |
| this.Score = score; |
| this.FuzziedSourceTerm = fuzziedSourceTerm; |
| } |
| } |
| |
| internal class ScoreTermQueue : Util.PriorityQueue<ScoreTerm> |
| { |
| public ScoreTermQueue(int size) |
| : base(size) |
| { |
| } |
| |
| /// <summary> |
| /// (non-Javadoc) |
| /// <see cref="Util.PriorityQueue{T}.LessThan(T, T)"/> |
| /// </summary> |
| #if NETFRAMEWORK |
| [MethodImpl(MethodImplOptions.NoOptimization)] // LUCENENET specific: comparing score equality fails in x86 on .NET Framework with optimizations enabled |
| #endif |
| protected internal override bool LessThan(ScoreTerm termA, ScoreTerm termB) |
| { |
| if (termA.Score == termB.Score) |
| return termA.Term.CompareTo(termB.Term) > 0; |
| else |
| return termA.Score < termB.Score; |
| } |
| |
| } |
| |
| /// <summary> |
| /// (non-Javadoc) |
| /// <see cref="Query.ToString(string)"/> |
| /// </summary> |
| /// <param name="field"></param> |
| /// <returns></returns> |
| public override string ToString(string field) |
| { |
| return null; |
| } |
| |
| public virtual bool IgnoreTF |
| { |
| get => ignoreTF; |
| set => ignoreTF = value; |
| } |
| } |
| } |