blob: 60e8ac731d030d0284b6ceb73707857ae8cd5387 [file] [log] [blame]
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Util;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Text;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Queries
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A query that executes high-frequency terms in an optional sub-query to prevent
/// slow queries due to "common" terms like stopwords. This query
/// builds 2 queries off the <see cref="Add(Term)"/> added terms: low-frequency
/// terms are added to a required boolean clause and high-frequency terms are
/// added to an optional boolean clause. The optional clause is only executed if
/// the required "low-frequency" clause matches. Scores produced by this query
/// will be slightly different than plain <see cref="BooleanQuery"/> scorer mainly due to
/// differences in the <see cref="Search.Similarities.Similarity.Coord(int,int)"/> number of leaf queries
/// in the required boolean clause. In most cases, high-frequency terms are
/// unlikely to significantly contribute to the document score unless at least
/// one of the low-frequency terms is matched. This query can improve
/// query execution times significantly if applicable.
/// <para>
/// <see cref="CommonTermsQuery"/> has several advantages over stopword filtering at
/// index or query time since a term can be "classified" based on the actual
/// document frequency in the index and can prevent slow queries even across
/// domains without specialized stopword files.
/// </para>
/// <para>
/// <b>Note:</b> if the query only contains high-frequency terms the query is
/// rewritten into a plain conjunction query, i.e. all high-frequency terms need to
/// match in order to match a document.
/// </para>
/// <para/>
/// Collection initializer note: To create and populate a <see cref="CommonTermsQuery"/>
/// in a single statement, you can use the following example as a guide:
///
/// <code>
/// var query = new CommonTermsQuery() {
/// new Term("field", "microsoft"),
/// new Term("field", "office")
/// };
/// </code>
/// </summary>
public class CommonTermsQuery : Query, IEnumerable<Term> // LUCENENET specific - implemented IEnumerable<Term>, which allows for use of collection initializer. See: https://stackoverflow.com/a/9195144
{
/*
* TODO maybe it would make sense to abstract this even further and allow to
* rewrite to dismax rather than boolean. Yet, this can already be subclassed
* to do so.
*/
/// <summary>Terms added through <see cref="Add(Term)"/>, kept in insertion order.</summary>
protected readonly IList<Term> m_terms = new List<Term>();
/// <summary>Passed to the constructors of the low- and high-frequency <see cref="BooleanQuery"/> sub-queries.</summary>
protected readonly bool m_disableCoord;
/// <summary>
/// Document-frequency threshold used to classify a term as high-frequency. Values &gt;= 1 are
/// compared against the term's document frequency directly; the value is also multiplied by the
/// reader's <c>MaxDoc</c> and compared as a fraction.
/// </summary>
protected readonly float m_maxTermFrequency;
/// <summary><see cref="Occur"/> applied to every clause of the low-frequency sub-query.</summary>
protected readonly Occur m_lowFreqOccur;
/// <summary><see cref="Occur"/> applied to every clause of the high-frequency sub-query.</summary>
protected readonly Occur m_highFreqOccur;
/// <summary>Boost assigned to the low-frequency sub-query when the query is built.</summary>
protected float m_lowFreqBoost = 1.0f;
/// <summary>Boost assigned to the high-frequency sub-query when the query is built.</summary>
protected float m_highFreqBoost = 1.0f;
/// <summary>Backing field of <see cref="LowFreqMinimumNumberShouldMatch"/>; 0 means no minimum.</summary>
protected float m_lowFreqMinNrShouldMatch = 0;
/// <summary>Backing field of <see cref="HighFreqMinimumNumberShouldMatch"/>; 0 means no minimum.</summary>
protected float m_highFreqMinNrShouldMatch = 0;
/// <summary>
/// Creates a new <see cref="CommonTermsQuery"/> with
/// <see cref="Search.Similarities.Similarity.Coord(int,int)"/> left enabled for the
/// low / high frequency sub-queries.
/// </summary>
/// <param name="highFreqOccur">
/// the <see cref="Occur"/> applied to high frequency terms </param>
/// <param name="lowFreqOccur">
/// the <see cref="Occur"/> applied to low frequency terms </param>
/// <param name="maxTermFrequency">
/// a value in [0..1) (or an absolute number &gt;=1) giving the maximum
/// document frequency a term may have and still be treated as a
/// low frequency term. </param>
/// <exception cref="ArgumentException">
/// if <see cref="Occur.MUST_NOT"/> is passed as <paramref name="lowFreqOccur"/> or
/// <paramref name="highFreqOccur"/> </exception>
public CommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency)
    : this(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoord: false)
{
}
/// <summary>
/// Creates a new <see cref="CommonTermsQuery"/>
/// </summary>
/// <param name="highFreqOccur">
/// the <see cref="Occur"/> applied to high frequency terms </param>
/// <param name="lowFreqOccur">
/// the <see cref="Occur"/> applied to low frequency terms </param>
/// <param name="maxTermFrequency">
/// a value in [0..1) (or an absolute number &gt;=1) giving the maximum
/// document frequency a term may have and still be treated as a
/// low frequency term. </param>
/// <param name="disableCoord">
/// disables <see cref="Search.Similarities.Similarity.Coord(int,int)"/> in scoring for the low
/// / high frequency sub-queries </param>
/// <exception cref="ArgumentException">
/// if <see cref="Occur.MUST_NOT"/> is passed as <paramref name="lowFreqOccur"/> or
/// <paramref name="highFreqOccur"/> </exception>
public CommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur,
    float maxTermFrequency, bool disableCoord)
{
    // Reject MUST_NOT up front; the high-frequency argument is checked first,
    // so it is the one reported when both arguments are invalid.
    if (highFreqOccur == Occur.MUST_NOT)
    {
        throw new ArgumentException("highFreqOccur should be MUST or SHOULD but was MUST_NOT");
    }
    if (lowFreqOccur == Occur.MUST_NOT)
    {
        throw new ArgumentException("lowFreqOccur should be MUST or SHOULD but was MUST_NOT");
    }

    m_highFreqOccur = highFreqOccur;
    m_lowFreqOccur = lowFreqOccur;
    m_maxTermFrequency = maxTermFrequency;
    m_disableCoord = disableCoord;
}
/// <summary>
/// Adds a term to the <see cref="CommonTermsQuery"/>
/// </summary>
/// <param name="term">
/// the term to add </param>
/// <exception cref="ArgumentNullException">
/// if <paramref name="term"/> is <c>null</c> </exception>
public virtual void Add(Term term)
{
    if (term == null)
    {
        // ArgumentNullException derives from ArgumentException, so callers that
        // catch the broader type keep working while the offending parameter is named.
        throw new ArgumentNullException(nameof(term), "Term must not be null");
    }
    m_terms.Add(term);
}
/// <summary>
/// Rewrites this query against <paramref name="reader"/>.
/// <para>
/// With no terms an empty <see cref="BooleanQuery"/> is returned; with exactly one
/// term the result of <see cref="NewTermQuery(Term, TermContext)"/> carrying this
/// query's boost is returned. Otherwise the term contexts are collected from the
/// reader's leaves and the result of
/// <see cref="BuildQuery(int, TermContext[], Term[])"/> is returned.
/// </para>
/// </summary>
/// <param name="reader"> the reader whose leaves and <c>MaxDoc</c> are consulted </param>
/// <returns> the rewritten query </returns>
public override Query Rewrite(IndexReader reader)
{
    int termCount = m_terms.Count;
    if (termCount == 0)
    {
        return new BooleanQuery();
    }
    if (termCount == 1)
    {
        Query single = NewTermQuery(m_terms[0], null);
        single.Boost = Boost;
        return single;
    }

    IList<AtomicReaderContext> readerLeaves = reader.Leaves;
    int docLimit = reader.MaxDoc;
    TermContext[] contexts = new TermContext[m_terms.Count];
    Term[] termArray = m_terms.ToArray();
    CollectTermContext(reader, readerLeaves, contexts, termArray);
    return BuildQuery(docLimit, contexts, termArray);
}
/// <summary>
/// Converts the configured low-frequency minimum-should-match value into a clause count.
/// </summary>
/// <param name="numOptional"> number of clauses in the low-frequency sub-query </param>
/// <returns> the number of low-frequency clauses that must match </returns>
protected virtual int CalcLowFreqMinimumNumberShouldMatch(int numOptional)
    => MinNrShouldMatch(m_lowFreqMinNrShouldMatch, numOptional);
/// <summary>
/// Converts the configured high-frequency minimum-should-match value into a clause count.
/// </summary>
/// <param name="numOptional"> number of clauses in the high-frequency sub-query </param>
/// <returns> the number of high-frequency clauses that must match </returns>
protected virtual int CalcHighFreqMinimumNumberShouldMatch(int numOptional)
    => MinNrShouldMatch(m_highFreqMinNrShouldMatch, numOptional);
/// <summary>
/// Resolves a minimum-should-match setting to an absolute clause count.
/// </summary>
/// <param name="minNrShouldMatch">
/// either 0 (no minimum), an absolute count (&gt;= 1, truncated to an integer),
/// or a fraction of <paramref name="numOptional"/> </param>
/// <param name="numOptional"> number of optional clauses the fraction is applied to </param>
/// <returns> the number of clauses that must match </returns>
private static int MinNrShouldMatch(float minNrShouldMatch, int numOptional)
{
    if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f)
    {
        return (int)minNrShouldMatch;
    }
    // Math.Round defaults to banker's rounding (2.5 -> 2). The fraction must round
    // half up (2.5 -> 3), as Java's Math.round does in the code this was ported from.
    return (int)Math.Round(minNrShouldMatch * numOptional, MidpointRounding.AwayFromZero);
}
/// <summary>
/// Builds the rewritten query from the per-term statistics.
/// <para>
/// Each term is placed either in a low-frequency or a high-frequency
/// <see cref="BooleanQuery"/>. A term with no <see cref="TermContext"/> goes to the
/// low-frequency query. A term is high-frequency when its document frequency exceeds
/// the configured maximum term frequency, interpreted as an absolute count when
/// &gt;= 1 and as a fraction of <paramref name="maxDoc"/> (rounded up).
/// </para>
/// <para>
/// If there are no low-frequency clauses, the high-frequency query is returned on its
/// own, with its clauses turned into <see cref="Occur.MUST"/> when no minimum-should-match
/// applies and the high-frequency occur is not already <see cref="Occur.MUST"/>. If there
/// are no high-frequency clauses, the low-frequency query is returned. Otherwise both are
/// combined: high-frequency as <see cref="Occur.SHOULD"/>, low-frequency as
/// <see cref="Occur.MUST"/>. The returned query always carries this query's boost.
/// </para>
/// </summary>
/// <param name="maxDoc"> <c>MaxDoc</c> of the reader the contexts were collected from </param>
/// <param name="contextArray"> term contexts, parallel to <paramref name="queryTerms"/>; entries may be <c>null</c> </param>
/// <param name="queryTerms"> the terms of this query </param>
/// <returns> the query to execute </returns>
protected virtual Query BuildQuery(int maxDoc, TermContext[] contextArray, Term[] queryTerms)
{
    var lowFreq = new BooleanQuery(m_disableCoord) { Boost = m_lowFreqBoost };
    var highFreq = new BooleanQuery(m_disableCoord) { Boost = m_highFreqBoost };

    // Keep the fractional threshold as a double. Casting it to int is unspecified in
    // C# when the product exceeds int.MaxValue (e.g. a large absolute threshold times
    // maxDoc) and can yield int.MinValue, which would classify every term as
    // high-frequency.
    double fractionalThreshold = Math.Ceiling(m_maxTermFrequency * (float)maxDoc);

    for (int i = 0; i < queryTerms.Length; i++)
    {
        TermContext termContext = contextArray[i];
        if (termContext == null)
        {
            // The term was not found in any leaf.
            lowFreq.Add(NewTermQuery(queryTerms[i], null), m_lowFreqOccur);
            continue;
        }

        bool isHighFreq =
            (m_maxTermFrequency >= 1f && termContext.DocFreq > m_maxTermFrequency)
            || termContext.DocFreq > fractionalThreshold;
        if (isHighFreq)
        {
            highFreq.Add(NewTermQuery(queryTerms[i], termContext), m_highFreqOccur);
        }
        else
        {
            lowFreq.Add(NewTermQuery(queryTerms[i], termContext), m_lowFreqOccur);
        }
    }

    int numLowFreqClauses = lowFreq.GetClauses().Length;
    int numHighFreqClauses = highFreq.GetClauses().Length;
    if (m_lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0)
    {
        lowFreq.MinimumNumberShouldMatch = CalcLowFreqMinimumNumberShouldMatch(numLowFreqClauses);
    }
    if (m_highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0)
    {
        highFreq.MinimumNumberShouldMatch = CalcHighFreqMinimumNumberShouldMatch(numHighFreqClauses);
    }

    if (numLowFreqClauses == 0)
    {
        /*
         * if lowFreq is empty we rewrite the high freq terms in a conjunction to
         * prevent slow queries.
         */
        if (highFreq.MinimumNumberShouldMatch == 0 && m_highFreqOccur != Occur.MUST)
        {
            foreach (BooleanClause booleanClause in highFreq)
            {
                booleanClause.Occur = Occur.MUST;
            }
        }
        highFreq.Boost = Boost;
        return highFreq;
    }

    if (numHighFreqClauses == 0)
    {
        // only do low freq terms - we don't have high freq terms
        lowFreq.Boost = Boost;
        return lowFreq;
    }

    // Both parts are present: the wrapper always disables coord.
    var query = new BooleanQuery(true);
    query.Add(highFreq, Occur.SHOULD);
    query.Add(lowFreq, Occur.MUST);
    query.Boost = Boost;
    return query;
}
/// <summary>
/// Fills <paramref name="contextArray"/> with the statistics of each query term found
/// in <paramref name="leaves"/>.
/// <para>
/// For every leaf and every term, the term is looked up in the leaf's field. On the
/// first hit a new <see cref="TermContext"/> is stored at the term's index; further
/// hits are registered on the existing context. Entries of terms that are never found
/// are left untouched.
/// </para>
/// </summary>
/// <param name="reader"> the top-level reader whose context is used for new <see cref="TermContext"/> instances </param>
/// <param name="leaves"> the leaves to inspect </param>
/// <param name="contextArray"> receives the term contexts, parallel to <paramref name="queryTerms"/> </param>
/// <param name="queryTerms"> the terms to look up </param>
public virtual void CollectTermContext(IndexReader reader, IList<AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms)
{
    // Passed back into GetIterator on every call so the previous instance is offered for reuse.
    TermsEnum reusableEnum = null;
    foreach (AtomicReaderContext leaf in leaves)
    {
        Fields leafFields = leaf.AtomicReader.Fields;
        if (leafFields == null)
        {
            // reader has no fields
            continue;
        }

        for (int idx = 0; idx < queryTerms.Length; idx++)
        {
            Term current = queryTerms[idx];
            TermContext existing = contextArray[idx];
            Terms fieldTerms = leafFields.GetTerms(current.Field);
            if (fieldTerms == null)
            {
                // field does not exist
                continue;
            }

            reusableEnum = fieldTerms.GetIterator(reusableEnum);
            Debug.Assert(reusableEnum != null);
            if (reusableEnum == TermsEnum.EMPTY || !reusableEnum.SeekExact(current.Bytes))
            {
                continue;
            }

            if (existing != null)
            {
                existing.Register(reusableEnum.GetTermState(), leaf.Ord, reusableEnum.DocFreq, reusableEnum.TotalTermFreq);
            }
            else
            {
                contextArray[idx] = new TermContext(reader.Context, reusableEnum.GetTermState(), leaf.Ord, reusableEnum.DocFreq, reusableEnum.TotalTermFreq);
            }
        }
    }
}
/// <summary>
/// Gets whether <see cref="Search.Similarities.Similarity.Coord(int,int)"/> is disabled in scoring
/// for the high and low frequency query instances. The top level query
/// always disables coords.
/// </summary>
public virtual bool IsCoordDisabled
{
    get { return m_disableCoord; }
}
/// <summary>
/// Gets or sets the minimum number of low-frequency optional clauses that must be
/// satisfied for the low-frequency part of the query to match. The value is
/// either a fraction in the range [0..1) of the number of clauses in the
/// low-frequency sub-query, or a number <tt>&gt;=1</tt> giving an absolute
/// number of clauses that need to match.
///
/// <para>
/// By default no optional clauses are necessary for a match (unless there are
/// no required clauses). If a value is set, then the specified number of
/// clauses is required.
/// </para>
/// </summary>
public virtual float LowFreqMinimumNumberShouldMatch
{
    get => m_lowFreqMinNrShouldMatch;
    set => m_lowFreqMinNrShouldMatch = value;
}
/// <summary>
/// Gets or sets the minimum number of high-frequency optional clauses that must be
/// satisfied for the high-frequency part of the query to match. The value is
/// either a fraction in the range [0..1) of the number of clauses in the
/// high-frequency sub-query, or a number <tt>&gt;=1</tt> giving an absolute
/// number of clauses that need to match.
///
/// <para>
/// By default no optional clauses are necessary for a match (unless there are
/// no required clauses). If a value is set, then the specified number of
/// clauses is required.
/// </para>
/// </summary>
public virtual float HighFreqMinimumNumberShouldMatch
{
    get => m_highFreqMinNrShouldMatch;
    set => m_highFreqMinNrShouldMatch = value;
}
/// <summary>
/// Adds every term of this query to <paramref name="terms"/>.
/// </summary>
/// <param name="terms"> the set that receives the terms </param>
public override void ExtractTerms(ISet<Term> terms) => terms.UnionWith(m_terms);
/// <summary>
/// Renders the query as text: the comma-separated term queries (parenthesized when a
/// boost or a low-frequency minimum-should-match is set), followed by
/// <c>~(low high)</c> minimum-should-match values written back to back when either is
/// positive, followed by the boost when it differs from 1.
/// </summary>
/// <param name="field"> not used by this implementation </param>
/// <returns> the textual form of this query </returns>
public override string ToString(string field)
{
    var text = new StringBuilder();
    bool wrapTerms = (Boost != 1.0) || (LowFreqMinimumNumberShouldMatch > 0);

    if (wrapTerms)
    {
        text.Append("(");
    }

    // Emit the separator before every term except the first.
    for (int idx = 0; idx < m_terms.Count; idx++)
    {
        if (idx > 0)
        {
            text.Append(", ");
        }
        Term current = m_terms[idx];
        text.Append(NewTermQuery(current, null).ToString());
    }

    if (wrapTerms)
    {
        text.Append(")");
    }

    if (LowFreqMinimumNumberShouldMatch > 0 || HighFreqMinimumNumberShouldMatch > 0)
    {
        text.Append('~');
        text.Append("(");
        text.AppendFormat(CultureInfo.InvariantCulture, "{0:0.0#######}", LowFreqMinimumNumberShouldMatch);
        text.AppendFormat(CultureInfo.InvariantCulture, "{0:0.0#######}", HighFreqMinimumNumberShouldMatch);
        text.Append(")");
    }

    if (Boost != 1.0f)
    {
        text.Append(ToStringUtils.Boost(Boost));
    }

    return text.ToString();
}
/// <summary>
/// Computes a hash from the base hash, the coord flag, both boosts, both occur
/// values, the maximum term frequency, both minimum-should-match values and the
/// term list (hashed structurally).
/// </summary>
/// <returns> the hash code of this query </returns>
public override int GetHashCode()
{
    const int Multiplier = 31;
    int hash = base.GetHashCode();
    hash = Multiplier * hash + (m_disableCoord ? 1231 : 1237);
    hash = Multiplier * hash + J2N.BitConversion.SingleToInt32Bits(m_highFreqBoost);
    hash = Multiplier * hash + m_highFreqOccur.GetHashCode();
    hash = Multiplier * hash + J2N.BitConversion.SingleToInt32Bits(m_lowFreqBoost);
    hash = Multiplier * hash + m_lowFreqOccur.GetHashCode();
    hash = Multiplier * hash + J2N.BitConversion.SingleToInt32Bits(m_maxTermFrequency);
    hash = Multiplier * hash + J2N.BitConversion.SingleToInt32Bits(m_lowFreqMinNrShouldMatch);
    hash = Multiplier * hash + J2N.BitConversion.SingleToInt32Bits(m_highFreqMinNrShouldMatch);

    // LUCENENET specific: use structural equality comparison
    int termsHash = 0;
    if (m_terms != null)
    {
        termsHash = JCG.ListEqualityComparer<Term>.Default.GetHashCode(m_terms);
    }
    return Multiplier * hash + termsHash;
}
/// <summary>
/// Determines whether <paramref name="obj"/> is a <see cref="CommonTermsQuery"/> of the
/// same runtime type with the same settings and the same terms in the same order.
/// <para>
/// All <see cref="float"/> settings are compared by bit pattern, which is how
/// <see cref="GetHashCode()"/> hashes them, so equal queries always produce equal hash codes.
/// </para>
/// </summary>
/// <param name="obj"> the object to compare with </param>
/// <returns> <c>true</c> if the queries are equal; otherwise <c>false</c> </returns>
public override bool Equals(object obj)
{
    if (ReferenceEquals(this, obj))
    {
        return true;
    }
    if (!base.Equals(obj))
    {
        return false;
    }
    if (this.GetType() != obj.GetType())
    {
        return false;
    }
    var other = (CommonTermsQuery)obj;
    if (m_disableCoord != other.m_disableCoord)
    {
        return false;
    }
    if (J2N.BitConversion.SingleToInt32Bits(m_highFreqBoost) != J2N.BitConversion.SingleToInt32Bits(other.m_highFreqBoost))
    {
        return false;
    }
    if (m_highFreqOccur != other.m_highFreqOccur)
    {
        return false;
    }
    if (J2N.BitConversion.SingleToInt32Bits(m_lowFreqBoost) != J2N.BitConversion.SingleToInt32Bits(other.m_lowFreqBoost))
    {
        return false;
    }
    if (m_lowFreqOccur != other.m_lowFreqOccur)
    {
        return false;
    }
    if (J2N.BitConversion.SingleToInt32Bits(m_maxTermFrequency) != J2N.BitConversion.SingleToInt32Bits(other.m_maxTermFrequency))
    {
        return false;
    }
    // Compare bit patterns rather than using float !=. With !=, 0f and -0f are equal
    // although GetHashCode hashes their differing bits, which breaks the
    // Equals/GetHashCode contract.
    if (J2N.BitConversion.SingleToInt32Bits(m_lowFreqMinNrShouldMatch) != J2N.BitConversion.SingleToInt32Bits(other.m_lowFreqMinNrShouldMatch))
    {
        return false;
    }
    if (J2N.BitConversion.SingleToInt32Bits(m_highFreqMinNrShouldMatch) != J2N.BitConversion.SingleToInt32Bits(other.m_highFreqMinNrShouldMatch))
    {
        return false;
    }
    if (m_terms == null)
    {
        return other.m_terms == null;
    }
    // LUCENENET specific: use structural equality comparison
    return JCG.ListEqualityComparer<Term>.Default.Equals(m_terms, other.m_terms);
}
/// <summary>
/// Creates the <see cref="TermQuery"/> used for a single term.
/// <para>Subclasses may override this to customize the generated queries.</para> </summary>
/// <param name="term"> the term to query </param>
/// <param name="context"> the <see cref="TermContext"/> to build the term query with, or <c>null</c> to build it from the term alone </param>
/// <returns> a new <see cref="TermQuery"/> instance </returns>
protected virtual Query NewTermQuery(Term term, TermContext context)
{
    if (context != null)
    {
        return new TermQuery(term, context);
    }
    return new TermQuery(term);
}
/// <summary>
/// Gets an enumerator over the terms held in <see cref="m_terms"/>.
/// </summary>
/// <returns>An enumerator that walks the <see cref="m_terms"/> collection.</returns>
// LUCENENET specific
public IEnumerator<Term> GetEnumerator() => m_terms.GetEnumerator();
/// <summary>
/// Gets a non-generic enumerator over the terms held in <see cref="m_terms"/>;
/// forwards to <see cref="GetEnumerator()"/>.
/// </summary>
/// <returns>An enumerator that walks the <see cref="m_terms"/> collection.</returns>
// LUCENENET specific
IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
}
}