| using Lucene.Net.Index; |
| using Lucene.Net.Search; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.Globalization; |
| using System.Linq; |
| using System.Text; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Queries |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// A query that executes high-frequency terms in a optional sub-query to prevent |
| /// slow queries due to "common" terms like stopwords. This query |
| /// builds 2 queries off the <see cref="Add(Term)"/> added terms: low-frequency |
| /// terms are added to a required boolean clause and high-frequency terms are |
| /// added to an optional boolean clause. The optional clause is only executed if |
| /// the required "low-frequency" clause matches. Scores produced by this query |
| /// will be slightly different than plain <see cref="BooleanQuery"/> scorer mainly due to |
| /// differences in the <see cref="Search.Similarities.Similarity.Coord(int,int)"/> number of leaf queries |
| /// in the required boolean clause. In most cases, high-frequency terms are |
| /// unlikely to significantly contribute to the document score unless at least |
| /// one of the low-frequency terms are matched. This query can improve |
| /// query execution times significantly if applicable. |
| /// <para> |
| /// <see cref="CommonTermsQuery"/> has several advantages over stopword filtering at |
| /// index or query time since a term can be "classified" based on the actual |
| /// document frequency in the index and can prevent slow queries even across |
| /// domains without specialized stopword files. |
| /// </para> |
| /// <para> |
| /// <b>Note:</b> if the query only contains high-frequency terms the query is |
| /// rewritten into a plain conjunction query ie. all high-frequency terms need to |
| /// match in order to match a document. |
| /// </para> |
| /// <para/> |
| /// Collection initializer note: To create and populate a <see cref="CommonTermsQuery"/> |
| /// in a single statement, you can use the following example as a guide: |
| /// |
| /// <code> |
| /// var query = new CommonTermsQuery() { |
| /// new Term("field", "microsoft"), |
| /// new Term("field", "office") |
| /// }; |
| /// </code> |
| /// </summary> |
| public class CommonTermsQuery : Query, IEnumerable<Term> // LUCENENET specific - implemented IEnumerable<Term>, which allows for use of collection initializer. See: https://stackoverflow.com/a/9195144 |
| { |
| /* |
| * TODO maybe it would make sense to abstract this even further and allow to |
| * rewrite to dismax rather than boolean. Yet, this can already be subclassed |
| * to do so. |
| */ |
| protected readonly IList<Term> m_terms = new List<Term>(); |
| protected readonly bool m_disableCoord; |
| protected readonly float m_maxTermFrequency; |
| protected readonly Occur m_lowFreqOccur; |
| protected readonly Occur m_highFreqOccur; |
| protected float m_lowFreqBoost = 1.0f; |
| protected float m_highFreqBoost = 1.0f; |
| protected float m_lowFreqMinNrShouldMatch = 0; |
| protected float m_highFreqMinNrShouldMatch = 0; |
| |
| /// <summary> |
| /// Creates a new <see cref="CommonTermsQuery"/> |
| /// </summary> |
| /// <param name="highFreqOccur"> |
| /// <see cref="Occur"/> used for high frequency terms </param> |
| /// <param name="lowFreqOccur"> |
| /// <see cref="Occur"/> used for low frequency terms </param> |
| /// <param name="maxTermFrequency"> |
| /// a value in [0..1) (or absolute number >=1) representing the |
| /// maximum threshold of a terms document frequency to be considered a |
| /// low frequency term. </param> |
| /// <exception cref="ArgumentException"> |
| /// if <see cref="Occur.MUST_NOT"/> is pass as <paramref name="lowFreqOccur"/> or |
| /// <paramref name="highFreqOccur"/> </exception> |
| public CommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency) |
| : this(highFreqOccur, lowFreqOccur, maxTermFrequency, false) |
| { |
| } |
| |
| /// <summary> |
| /// Creates a new <see cref="CommonTermsQuery"/> |
| /// </summary> |
| /// <param name="highFreqOccur"> |
| /// <see cref="Occur"/> used for high frequency terms </param> |
| /// <param name="lowFreqOccur"> |
| /// <see cref="Occur"/> used for low frequency terms </param> |
| /// <param name="maxTermFrequency"> |
| /// a value in [0..1) (or absolute number >=1) representing the |
| /// maximum threshold of a terms document frequency to be considered a |
| /// low frequency term. </param> |
| /// <param name="disableCoord"> |
| /// disables <see cref="Search.Similarities.Similarity.Coord(int,int)"/> in scoring for the low |
| /// / high frequency sub-queries </param> |
| /// <exception cref="ArgumentException"> |
| /// if <see cref="Occur.MUST_NOT"/> is pass as <paramref name="lowFreqOccur"/> or |
| /// <paramref name="highFreqOccur"/> </exception> |
| public CommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, |
| float maxTermFrequency, bool disableCoord) |
| { |
| if (highFreqOccur == Occur.MUST_NOT) |
| { |
| throw new ArgumentException("highFreqOccur should be MUST or SHOULD but was MUST_NOT"); |
| } |
| if (lowFreqOccur == Occur.MUST_NOT) |
| { |
| throw new ArgumentException("lowFreqOccur should be MUST or SHOULD but was MUST_NOT"); |
| } |
| this.m_disableCoord = disableCoord; |
| this.m_highFreqOccur = highFreqOccur; |
| this.m_lowFreqOccur = lowFreqOccur; |
| this.m_maxTermFrequency = maxTermFrequency; |
| } |
| |
| /// <summary> |
| /// Adds a term to the <see cref="CommonTermsQuery"/> |
| /// </summary> |
| /// <param name="term"> |
| /// the term to add </param> |
| public virtual void Add(Term term) |
| { |
| if (term == null) |
| { |
| throw new ArgumentException("Term must not be null"); |
| } |
| this.m_terms.Add(term); |
| } |
| |
| public override Query Rewrite(IndexReader reader) |
| { |
| if (this.m_terms.Count == 0) |
| { |
| return new BooleanQuery(); |
| } |
| else if (this.m_terms.Count == 1) |
| { |
| Query tq = NewTermQuery(this.m_terms[0], null); |
| tq.Boost = Boost; |
| return tq; |
| } |
| var leaves = reader.Leaves; |
| int maxDoc = reader.MaxDoc; |
| var contextArray = new TermContext[m_terms.Count]; |
| var queryTerms = this.m_terms.ToArray(); |
| CollectTermContext(reader, leaves, contextArray, queryTerms); |
| return BuildQuery(maxDoc, contextArray, queryTerms); |
| } |
| |
| protected virtual int CalcLowFreqMinimumNumberShouldMatch(int numOptional) |
| { |
| return MinNrShouldMatch(m_lowFreqMinNrShouldMatch, numOptional); |
| } |
| |
| protected virtual int CalcHighFreqMinimumNumberShouldMatch(int numOptional) |
| { |
| return MinNrShouldMatch(m_highFreqMinNrShouldMatch, numOptional); |
| } |
| |
| private int MinNrShouldMatch(float minNrShouldMatch, int numOptional) |
| { |
| if (minNrShouldMatch >= 1.0f || minNrShouldMatch == 0.0f) |
| { |
| return (int)minNrShouldMatch; |
| } |
| return (int)Math.Round(minNrShouldMatch * numOptional); |
| } |
| |
| protected virtual Query BuildQuery(int maxDoc, TermContext[] contextArray, Term[] queryTerms) |
| { |
| var lowFreq = new BooleanQuery(m_disableCoord); |
| var highFreq = new BooleanQuery(m_disableCoord) { Boost = m_highFreqBoost }; |
| lowFreq.Boost = m_lowFreqBoost; |
| var query = new BooleanQuery(true); |
| for (int i = 0; i < queryTerms.Length; i++) |
| { |
| TermContext termContext = contextArray[i]; |
| if (termContext == null) |
| { |
| lowFreq.Add(NewTermQuery(queryTerms[i], null), m_lowFreqOccur); |
| } |
| else |
| { |
| if ((m_maxTermFrequency >= 1f && termContext.DocFreq > m_maxTermFrequency) || (termContext.DocFreq > (int)Math.Ceiling(m_maxTermFrequency * (float)maxDoc))) |
| { |
| highFreq.Add(NewTermQuery(queryTerms[i], termContext), m_highFreqOccur); |
| } |
| else |
| { |
| lowFreq.Add(NewTermQuery(queryTerms[i], termContext), m_lowFreqOccur); |
| } |
| } |
| |
| } |
| int numLowFreqClauses = lowFreq.GetClauses().Length; |
| int numHighFreqClauses = highFreq.GetClauses().Length; |
| if (m_lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) |
| { |
| int minMustMatch = CalcLowFreqMinimumNumberShouldMatch(numLowFreqClauses); |
| lowFreq.MinimumNumberShouldMatch = minMustMatch; |
| } |
| if (m_highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) |
| { |
| int minMustMatch = CalcHighFreqMinimumNumberShouldMatch(numHighFreqClauses); |
| highFreq.MinimumNumberShouldMatch = minMustMatch; |
| } |
| if (lowFreq.GetClauses().Length == 0) |
| { |
| /* |
| * if lowFreq is empty we rewrite the high freq terms in a conjunction to |
| * prevent slow queries. |
| */ |
| if (highFreq.MinimumNumberShouldMatch == 0 && m_highFreqOccur != Occur.MUST) |
| { |
| foreach (BooleanClause booleanClause in highFreq) |
| { |
| booleanClause.Occur = Occur.MUST; |
| } |
| } |
| highFreq.Boost = Boost; |
| return highFreq; |
| } |
| else if (highFreq.GetClauses().Length == 0) |
| { |
| // only do low freq terms - we don't have high freq terms |
| lowFreq.Boost = Boost; |
| return lowFreq; |
| } |
| else |
| { |
| query.Add(highFreq, Occur.SHOULD); |
| query.Add(lowFreq, Occur.MUST); |
| query.Boost = Boost; |
| return query; |
| } |
| } |
| |
| public virtual void CollectTermContext(IndexReader reader, IList<AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms) |
| { |
| TermsEnum termsEnum = null; |
| foreach (AtomicReaderContext context in leaves) |
| { |
| Fields fields = context.AtomicReader.Fields; |
| if (fields == null) |
| { |
| // reader has no fields |
| continue; |
| } |
| for (int i = 0; i < queryTerms.Length; i++) |
| { |
| Term term = queryTerms[i]; |
| TermContext termContext = contextArray[i]; |
| Terms terms = fields.GetTerms(term.Field); |
| if (terms == null) |
| { |
| // field does not exist |
| continue; |
| } |
| termsEnum = terms.GetIterator(termsEnum); |
| Debug.Assert(termsEnum != null); |
| |
| if (termsEnum == TermsEnum.EMPTY) |
| { |
| continue; |
| } |
| if (termsEnum.SeekExact(term.Bytes)) |
| { |
| if (termContext == null) |
| { |
| contextArray[i] = new TermContext(reader.Context, termsEnum.GetTermState(), context.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq); |
| } |
| else |
| { |
| termContext.Register(termsEnum.GetTermState(), context.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq); |
| } |
| |
| } |
| |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Returns true iff <see cref="Search.Similarities.Similarity.Coord(int,int)"/> is disabled in scoring |
| /// for the high and low frequency query instance. The top level query will |
| /// always disable coords. |
| /// </summary> |
| public virtual bool IsCoordDisabled => m_disableCoord; |
| |
| /// <summary> |
| /// Gets or Sets a minimum number of the low frequent optional BooleanClauses which must be |
| /// satisfied in order to produce a match on the low frequency terms query |
| /// part. This method accepts a float value in the range [0..1) as a fraction |
| /// of the actual query terms in the low frequent clause or a number |
| /// <tt>>=1</tt> as an absolut number of clauses that need to match. |
| /// |
| /// <para> |
| /// By default no optional clauses are necessary for a match (unless there are |
| /// no required clauses). If this method is used, then the specified number of |
| /// clauses is required. |
| /// </para> |
| /// </summary> |
| public virtual float LowFreqMinimumNumberShouldMatch |
| { |
| get => m_lowFreqMinNrShouldMatch; |
| set => m_lowFreqMinNrShouldMatch = value; |
| } |
| |
| |
| /// <summary> |
| /// Gets or Sets a minimum number of the high frequent optional BooleanClauses which must be |
| /// satisfied in order to produce a match on the low frequency terms query |
| /// part. This method accepts a float value in the range [0..1) as a fraction |
| /// of the actual query terms in the low frequent clause or a number |
| /// <tt>>=1</tt> as an absolut number of clauses that need to match. |
| /// |
| /// <para> |
| /// By default no optional clauses are necessary for a match (unless there are |
| /// no required clauses). If this method is used, then the specified number of |
| /// clauses is required. |
| /// </para> |
| /// </summary> |
| public virtual float HighFreqMinimumNumberShouldMatch |
| { |
| get => m_highFreqMinNrShouldMatch; |
| set => m_highFreqMinNrShouldMatch = value; |
| } |
| |
| |
| public override void ExtractTerms(ISet<Term> terms) |
| { |
| terms.UnionWith(this.m_terms); |
| } |
| |
| public override string ToString(string field) |
| { |
| var buffer = new StringBuilder(); |
| bool needParens = (Boost != 1.0) || (LowFreqMinimumNumberShouldMatch > 0); |
| if (needParens) |
| { |
| buffer.Append("("); |
| } |
| for (int i = 0; i < m_terms.Count; i++) |
| { |
| Term t = m_terms[i]; |
| buffer.Append(NewTermQuery(t, null).ToString()); |
| |
| if (i != m_terms.Count - 1) |
| { |
| buffer.Append(", "); |
| } |
| } |
| if (needParens) |
| { |
| buffer.Append(")"); |
| } |
| if (LowFreqMinimumNumberShouldMatch > 0 || HighFreqMinimumNumberShouldMatch > 0) |
| { |
| buffer.Append('~'); |
| buffer.Append("("); |
| buffer.AppendFormat(CultureInfo.InvariantCulture, "{0:0.0#######}", LowFreqMinimumNumberShouldMatch); |
| buffer.AppendFormat(CultureInfo.InvariantCulture, "{0:0.0#######}", HighFreqMinimumNumberShouldMatch); |
| buffer.Append(")"); |
| } |
| if (Boost != 1.0f) |
| { |
| buffer.Append(ToStringUtils.Boost(Boost)); |
| } |
| return buffer.ToString(); |
| } |
| |
| public override int GetHashCode() |
| { |
| const int prime = 31; |
| int result = base.GetHashCode(); |
| result = prime * result + (m_disableCoord ? 1231 : 1237); |
| result = prime * result + J2N.BitConversion.SingleToInt32Bits(m_highFreqBoost); |
| result = prime * result + /*((highFreqOccur == null) ? 0 :*/ m_highFreqOccur.GetHashCode()/*)*/; |
| result = prime * result + J2N.BitConversion.SingleToInt32Bits(m_lowFreqBoost); |
| result = prime * result + /*((lowFreqOccur == null) ? 0 :*/ m_lowFreqOccur.GetHashCode()/*)*/; |
| result = prime * result + J2N.BitConversion.SingleToInt32Bits(m_maxTermFrequency); |
| result = prime * result + J2N.BitConversion.SingleToInt32Bits(m_lowFreqMinNrShouldMatch); |
| result = prime * result + J2N.BitConversion.SingleToInt32Bits(m_highFreqMinNrShouldMatch); |
| // LUCENENET specific: use structural equality comparison |
| result = prime * result + ((m_terms == null) ? 0 : JCG.ListEqualityComparer<Term>.Default.GetHashCode(m_terms)); |
| return result; |
| } |
| |
| public override bool Equals(object obj) |
| { |
| if (this == obj) |
| { |
| return true; |
| } |
| if (!base.Equals(obj)) |
| { |
| return false; |
| } |
| if (this.GetType() != obj.GetType()) |
| { |
| return false; |
| } |
| var other = (CommonTermsQuery)obj; |
| if (m_disableCoord != other.m_disableCoord) |
| { |
| return false; |
| } |
| if (J2N.BitConversion.SingleToInt32Bits(m_highFreqBoost) != J2N.BitConversion.SingleToInt32Bits(other.m_highFreqBoost)) |
| { |
| return false; |
| } |
| if (m_highFreqOccur != other.m_highFreqOccur) |
| { |
| return false; |
| } |
| if (J2N.BitConversion.SingleToInt32Bits(m_lowFreqBoost) != J2N.BitConversion.SingleToInt32Bits(other.m_lowFreqBoost)) |
| { |
| return false; |
| } |
| if (m_lowFreqOccur != other.m_lowFreqOccur) |
| { |
| return false; |
| } |
| if (J2N.BitConversion.SingleToInt32Bits(m_maxTermFrequency) != J2N.BitConversion.SingleToInt32Bits(other.m_maxTermFrequency)) |
| { |
| return false; |
| } |
| if (m_lowFreqMinNrShouldMatch != other.m_lowFreqMinNrShouldMatch) |
| { |
| return false; |
| } |
| if (m_highFreqMinNrShouldMatch != other.m_highFreqMinNrShouldMatch) |
| { |
| return false; |
| } |
| if (m_terms == null) |
| { |
| if (other.m_terms != null) |
| { |
| return false; |
| } |
| } |
| // LUCENENET specific: use structural equality comparison |
| else if (!JCG.ListEqualityComparer<Term>.Default.Equals(m_terms, other.m_terms)) |
| { |
| return false; |
| } |
| return true; |
| } |
| |
| /// <summary> |
| /// Builds a new <see cref="TermQuery"/> instance. |
| /// <para>This is intended for subclasses that wish to customize the generated queries.</para> </summary> |
| /// <param name="term"> term </param> |
| /// <param name="context"> the <see cref="TermContext"/> to be used to create the low level term query. Can be <c>null</c>. </param> |
| /// <returns> new <see cref="TermQuery"/> instance </returns> |
| protected virtual Query NewTermQuery(Term term, TermContext context) |
| { |
| return context == null ? new TermQuery(term) : new TermQuery(term, context); |
| } |
| |
| /// <summary> |
| /// Returns an enumerator that iterates through the <see cref="m_terms"/> collection. |
| /// </summary> |
| /// <returns>An enumerator that can be used to iterate through the <see cref="m_terms"/> collection.</returns> |
| // LUCENENET specific |
| public IEnumerator<Term> GetEnumerator() |
| { |
| return this.m_terms.GetEnumerator(); |
| } |
| |
| /// <summary> |
| /// Returns an enumerator that iterates through the <see cref="m_terms"/> collection. |
| /// </summary> |
| /// <returns>An enumerator that can be used to iterate through the <see cref="m_terms"/> collection.</returns> |
| // LUCENENET specific |
| IEnumerator IEnumerable.GetEnumerator() |
| { |
| return GetEnumerator(); |
| } |
| } |
| } |