blob: bb7108e03ac0c897f671950f89352916749f0c0e [file] [log] [blame]
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Search.Similarities;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Sandbox.Queries
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
/// In effect this mixes the behaviour of <see cref="FuzzyQuery"/> and MoreLikeThis but with special consideration
/// of fuzzy scoring factors.
/// This generally produces good results for queries where users may provide details in a number of
/// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
/// a fast query.
/// <para/>
/// For each source term the fuzzy variants are held in a <see cref="BooleanQuery"/> with no coord factor (because
/// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
/// <see cref="TermQuery"/> is used for variants and does not use that variant term's IDF because this would favour rarer
/// terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
/// term) and this is factored into the variant's boost. If the source query term does not exist in the
/// index the average IDF of the variants is used.
/// </summary>
public class FuzzyLikeThisQuery : Query
{
// TODO: generalize this query (at least it should not reuse this static sim!
// a better way might be to convert this into multitermquery rewrite methods.
// the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq)
// provided to TermQuery, so that the general idea is agnostic to any scoring system...
internal static TFIDFSimilarity sim = new DefaultSimilarity();
private Query rewrittenQuery = null;
private readonly IList<FieldVals> fieldVals = new JCG.List<FieldVals>();
private readonly Analyzer analyzer;
private readonly ScoreTermQueue q;
private const int MAX_VARIANTS_PER_TERM = 50;
private bool ignoreTF = false;
private readonly int maxNumTerms;
public override int GetHashCode()
{
int prime = 31;
int result = base.GetHashCode();
result = prime * result + ((analyzer == null) ? 0 : analyzer.GetHashCode());
result = prime * result
+ ((fieldVals == null) ? 0 : fieldVals.GetHashCode());
result = prime * result + (ignoreTF ? 1231 : 1237);
result = prime * result + maxNumTerms;
return result;
}
public override bool Equals(object obj)
{
if (this == obj)
return true;
if (obj == null)
return false;
if (GetType() != obj.GetType())
return false;
if (!base.Equals(obj))
{
return false;
}
FuzzyLikeThisQuery other = (FuzzyLikeThisQuery)obj;
if (analyzer == null)
{
if (other.analyzer != null)
return false;
}
else if (!analyzer.Equals(other.analyzer))
return false;
if (fieldVals == null)
{
if (other.fieldVals != null)
return false;
}
else if (!fieldVals.Equals(other.fieldVals))
return false;
if (ignoreTF != other.ignoreTF)
return false;
if (maxNumTerms != other.maxNumTerms)
return false;
return true;
}
/// <summary>
///
/// </summary>
/// <param name="maxNumTerms">The total number of terms clauses that will appear once rewritten as a <see cref="BooleanQuery"/></param>
/// <param name="analyzer"></param>
public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
{
q = new ScoreTermQueue(maxNumTerms);
this.analyzer = analyzer;
this.maxNumTerms = maxNumTerms;
}
internal class FieldVals
{
internal string queryString;
internal string fieldName;
internal float minSimilarity;
internal int prefixLength;
public FieldVals(string name, float similarity, int length, string queryString)
{
fieldName = name;
minSimilarity = similarity;
prefixLength = length;
this.queryString = queryString;
}
public override int GetHashCode()
{
int prime = 31;
int result = 1;
result = prime * result
+ ((fieldName == null) ? 0 : fieldName.GetHashCode());
result = prime * result + J2N.BitConversion.SingleToInt32Bits(minSimilarity);
result = prime * result + prefixLength;
result = prime * result
+ ((queryString == null) ? 0 : queryString.GetHashCode());
return result;
}
public override bool Equals(object obj)
{
if (this == obj)
return true;
if (obj == null)
return false;
if (GetType() != obj.GetType())
return false;
FieldVals other = (FieldVals)obj;
if (fieldName == null)
{
if (other.fieldName != null)
return false;
}
else if (!fieldName.Equals(other.fieldName, StringComparison.Ordinal))
return false;
if (J2N.BitConversion.SingleToInt32Bits(minSimilarity) != J2N.BitConversion
.SingleToInt32Bits(other.minSimilarity))
return false;
if (prefixLength != other.prefixLength)
return false;
if (queryString == null)
{
if (other.queryString != null)
return false;
}
else if (!queryString.Equals(other.queryString, StringComparison.Ordinal))
return false;
return true;
}
}
/// <summary>
/// Adds user input for "fuzzification"
/// </summary>
/// <param name="queryString">The string which will be parsed by the analyzer and for which fuzzy variants will be parsed</param>
/// <param name="fieldName">The minimum similarity of the term variants (see <see cref="FuzzyTermsEnum"/>)</param>
/// <param name="minSimilarity">Length of required common prefix on variant terms (see <see cref="FuzzyTermsEnum"/>)</param>
/// <param name="prefixLength"></param>
public virtual void AddTerms(string queryString, string fieldName, float minSimilarity, int prefixLength)
{
fieldVals.Add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString));
}
private void AddTerms(IndexReader reader, FieldVals f)
{
if (f.queryString == null) return;
Terms terms = MultiFields.GetTerms(reader, f.fieldName);
if (terms == null)
{
return;
}
TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString);
try
{
ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
int corpusNumDocs = reader.NumDocs;
ISet<string> processedTerms = new JCG.HashSet<string>();
ts.Reset();
while (ts.IncrementToken())
{
string term = termAtt.ToString();
if (!processedTerms.Contains(term))
{
processedTerms.Add(term);
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore = 0;
Term startTerm = new Term(f.fieldName, term);
AttributeSource atts = new AttributeSource();
IMaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
//store the df so all variants use same idf
int df = reader.DocFreq(startTerm);
int numVariants = 0;
int totalVariantDocFreqs = 0;
BytesRef possibleMatch;
IBoostAttribute boostAtt =
fe.Attributes.AddAttribute<IBoostAttribute>();
while (fe.MoveNext())
{
possibleMatch = fe.Term;
numVariants++;
totalVariantDocFreqs += fe.DocFreq;
float score = boostAtt.Boost;
if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore)
{
ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
variantsQ.InsertWithOverflow(st);
minScore = variantsQ.Top.Score; // maintain minScore
}
maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
}
if (numVariants > 0)
{
int avgDf = totalVariantDocFreqs / numVariants;
if (df == 0)//no direct match we can use as df for all variants
{
df = avgDf; //use avg df of all variants
}
// take the top variants (scored by edit distance) and reset the score
// to include an IDF factor then add to the global queue for ranking
// overall top query terms
int size = variantsQ.Count;
for (int i = 0; i < size; i++)
{
ScoreTerm st = variantsQ.Pop();
st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
q.InsertWithOverflow(st);
}
}
}
}
ts.End();
}
finally
{
IOUtils.DisposeWhileHandlingException(ts);
}
}
public override Query Rewrite(IndexReader reader)
{
if (rewrittenQuery != null)
{
return rewrittenQuery;
}
//load up the list of possible terms
foreach (var f in fieldVals)
{
AddTerms(reader, f);
}
//clear the list of fields
fieldVals.Clear();
BooleanQuery bq = new BooleanQuery();
//create BooleanQueries to hold the variants for each token/field pair and ensure it
// has no coord factor
//Step 1: sort the termqueries by term/field
IDictionary<Term, List<ScoreTerm>> variantQueries = new Dictionary<Term, List<ScoreTerm>>();
int size = q.Count;
for (int i = 0; i < size; i++)
{
ScoreTerm st = q.Pop();
//List<ScoreTerm> l = variantQueries.get(st.fuzziedSourceTerm);
// if(l==null)
List<ScoreTerm> l;
if (!variantQueries.TryGetValue(st.FuzziedSourceTerm, out l) || l == null)
{
l = new List<ScoreTerm>();
variantQueries[st.FuzziedSourceTerm] = l;
}
l.Add(st);
}
//Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
foreach (List<ScoreTerm> variants in variantQueries.Values)
{
if (variants.Count == 1)
{
//optimize where only one selected variant
ScoreTerm st = variants[0];
Query tq = ignoreTF ? (Query)new ConstantScoreQuery(new TermQuery(st.Term)) : new TermQuery(st.Term, 1);
tq.Boost = st.Score; // set the boost to a mix of IDF and score
bq.Add(tq, Occur.SHOULD);
}
else
{
BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants
foreach (ScoreTerm st in variants)
{
// found a match
Query tq = ignoreTF ? (Query)new ConstantScoreQuery(new TermQuery(st.Term)) : new TermQuery(st.Term, 1);
tq.Boost = st.Score; // set the boost using the ScoreTerm's score
termVariants.Add(tq, Occur.SHOULD); // add to query
}
bq.Add(termVariants, Occur.SHOULD); // add to query
}
}
//TODO possible alternative step 3 - organize above booleans into a new layer of field-based
// booleans with a minimum-should-match of NumFields-1?
bq.Boost = Boost;
this.rewrittenQuery = bq;
return bq;
}
//Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
// term variants) then is reset with IDF for use in ranking against all other
// terms/fields
internal class ScoreTerm
{
public Term Term { get; set; }
public float Score { get; set; }
internal Term FuzziedSourceTerm { get; set; }
public ScoreTerm(Term term, float score, Term fuzziedSourceTerm)
{
this.Term = term;
this.Score = score;
this.FuzziedSourceTerm = fuzziedSourceTerm;
}
}
internal class ScoreTermQueue : Util.PriorityQueue<ScoreTerm>
{
public ScoreTermQueue(int size)
: base(size)
{
}
/// <summary>
/// (non-Javadoc)
/// <see cref="Util.PriorityQueue{T}.LessThan(T, T)"/>
/// </summary>
#if NETFRAMEWORK
[MethodImpl(MethodImplOptions.NoOptimization)] // LUCENENET specific: comparing score equality fails in x86 on .NET Framework with optimizations enabled
#endif
protected internal override bool LessThan(ScoreTerm termA, ScoreTerm termB)
{
if (termA.Score == termB.Score)
return termA.Term.CompareTo(termB.Term) > 0;
else
return termA.Score < termB.Score;
}
}
/// <summary>
/// (non-Javadoc)
/// <see cref="Query.ToString(string)"/>
/// </summary>
/// <param name="field"></param>
/// <returns></returns>
public override string ToString(string field)
{
return null;
}
public virtual bool IgnoreTF
{
get => ignoreTF;
set => ignoreTF = value;
}
}
}