blob: c6341d68fbc794aa849d1808e2e13d3eda68da64 [file] [log] [blame]
using System.IO;
namespace Lucene.Net.Search.Similarities
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// javadoc
using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
// javadoc
using BytesRef = Lucene.Net.Util.BytesRef;
using FieldInvertState = Lucene.Net.Index.FieldInvertState;
// javadoc
/// <summary>
/// Similarity defines the components of Lucene scoring.
/// <para/>
/// Expert: Scoring API.
/// <para/>
/// This is a low-level API, you should only extend this API if you want to implement
/// an information retrieval <i>model</i>. If you are instead looking for a convenient way
/// to alter Lucene's scoring, consider extending a higher-level implementation
/// such as <see cref="TFIDFSimilarity"/>, which implements the vector space model with this API, or
/// just tweaking the default implementation: <see cref="DefaultSimilarity"/>.
/// <para/>
/// Similarity determines how Lucene weights terms, and Lucene interacts with
/// this class at both <a href="#indextime">index-time</a> and
/// <a href="#querytime">query-time</a>.
/// <para/>
/// <a name="indextime"/>
/// At indexing time, the indexer calls <see cref="ComputeNorm(FieldInvertState)"/>, allowing
/// the <see cref="Similarity"/> implementation to set a per-document value for the field that will
/// be later accessible via <see cref="Index.AtomicReader.GetNormValues(string)"/>. Lucene makes no assumption
/// about what is in this norm, but it is most useful for encoding length normalization
/// information.
/// <para/>
/// Implementations should carefully consider how the normalization is encoded: while
/// Lucene's classical <see cref="TFIDFSimilarity"/> encodes a combination of index-time boost
/// and length normalization information with <see cref="Util.SmallSingle"/> into a single byte, this
/// might not be suitable for all purposes.
/// <para/>
/// Many formulas require the use of average document length, which can be computed via a
/// combination of <see cref="CollectionStatistics.SumTotalTermFreq"/> and
/// <see cref="CollectionStatistics.MaxDoc"/> or <see cref="CollectionStatistics.DocCount"/>,
/// depending upon whether the average should reflect field sparsity.
/// <para/>
/// Additional scoring factors can be stored in named
/// <see cref="Documents.NumericDocValuesField"/>s and accessed
/// at query-time with <see cref="Index.AtomicReader.GetNumericDocValues(string)"/>.
/// <para/>
/// Finally, using index-time boosts (either via folding into the normalization byte or
/// via <see cref="Index.DocValues"/>), is an inefficient way to boost the scores of different fields if the
/// boost will be the same for every document, instead the Similarity can simply take a constant
/// boost parameter <i>C</i>, and <see cref="PerFieldSimilarityWrapper"/> can return different
/// instances with different boosts depending upon field name.
/// <para/>
/// <a name="querytime"/>
/// At query-time, Queries interact with the Similarity via these steps:
/// <list type="number">
/// <item><description>The <see cref="ComputeWeight(float, CollectionStatistics, TermStatistics[])"/> method is called a single time,
/// allowing the implementation to compute any statistics (such as IDF, average document length, etc)
/// across <i>the entire collection</i>. The <see cref="TermStatistics"/> and <see cref="CollectionStatistics"/> passed in
/// already contain all of the raw statistics involved, so a <see cref="Similarity"/> can freely use any combination
/// of statistics without causing any additional I/O. Lucene makes no assumption about what is
/// stored in the returned <see cref="Similarity.SimWeight"/> object.</description></item>
/// <item><description>The query normalization process occurs a single time: <see cref="Similarity.SimWeight.GetValueForNormalization()"/>
/// is called for each query leaf node, <see cref="Similarity.QueryNorm(float)"/> is called for the top-level
/// query, and finally <see cref="Similarity.SimWeight.Normalize(float, float)"/> passes down the normalization value
/// and any top-level boosts (e.g. from enclosing <see cref="BooleanQuery"/>s).</description></item>
/// <item><description>For each segment in the index, the <see cref="Query"/> creates a <see cref="SimScorer"/> via <see cref="GetSimScorer(SimWeight, AtomicReaderContext)"/>.
/// The <see cref="SimScorer.Score(int, float)"/> method is called for each matching document.</description></item>
/// </list>
/// <para/>
/// <a name="explaintime"/>
/// When <see cref="IndexSearcher.Explain(Lucene.Net.Search.Query, int)"/> is called, queries consult the Similarity's <see cref="SimScorer"/> for an
/// explanation of how it computed its score. The query passes in the document id and an explanation of how the frequency
/// was computed.
/// <para/>
/// @lucene.experimental
/// </summary>
/// <seealso cref="Lucene.Net.Index.IndexWriterConfig.Similarity"/>
/// <seealso cref="IndexSearcher.Similarity"/>
public abstract class Similarity
{
/// <summary>
/// The only constructor of <see cref="Similarity"/>. It does nothing by itself and
/// exists so that subclass constructors (usually implicit ones) can chain to it.
/// </summary>
protected Similarity() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
{ }
/// <summary>
/// Extension point for coordinate-level matching.
/// <para/>
/// The base implementation switches this off by always returning <c>1</c>, because
/// with most modern models a coordination factor will only skew performance. Some
/// implementations, such as <see cref="TFIDFSimilarity"/>, override it.
/// </summary>
/// <param name="overlap"> number of query terms that matched in the document </param>
/// <param name="maxOverlap"> total number of terms in the query </param>
/// <returns> score factor based on term overlap with the query; always <c>1</c> in this base implementation </returns>
public virtual float Coord(int overlap, int maxOverlap) => 1f;
/// <summary>
/// Computes the normalization value for a query from the sum of the
/// normalization values (<see cref="SimWeight.GetValueForNormalization()"/>) of
/// each of the query terms. This value is passed back to the
/// weight (<see cref="SimWeight.Normalize(float, float)"/>) of each query
/// term, as a hook for attempting to make scores from different
/// queries comparable.
/// <para/>
/// The base implementation switches this off by always returning <c>1</c>. Some
/// implementations, such as <see cref="TFIDFSimilarity"/>, override it.
/// </summary>
/// <param name="valueForNormalization"> sum of the term normalization values </param>
/// <returns> normalization factor for query weights; always <c>1</c> in this base implementation </returns>
public virtual float QueryNorm(float valueForNormalization) => 1f;
/// <summary>
/// Computes the normalization value for a field, given the accumulated
/// state of term processing for this field (see <see cref="FieldInvertState"/>).
/// <para/>
/// Matches in longer fields are less precise, so implementations of this
/// method usually set smaller values when <c>state.Length</c> is large,
/// and larger values when <c>state.Length</c> is small.
/// <para/>
/// @lucene.experimental
/// </summary>
/// <param name="state"> current processing state for this field </param>
/// <returns> computed norm value </returns>
public abstract long ComputeNorm(FieldInvertState state);
/// <summary>
/// Computes any collection-level weight (e.g. IDF, average document length, etc.) needed for scoring a query.
/// <para/>
/// What is stored in the returned <see cref="SimWeight"/> is entirely up to the implementation.
/// </summary>
/// <param name="queryBoost"> the query-time boost. </param>
/// <param name="collectionStats"> collection-level statistics, such as the number of tokens in the collection. </param>
/// <param name="termStats"> term-level statistics, such as the document frequency of a term across the collection. </param>
/// <returns> <see cref="SimWeight"/> object with the information this <see cref="Similarity"/> needs to score a query. </returns>
public abstract SimWeight ComputeWeight(float queryBoost, CollectionStatistics collectionStats, params TermStatistics[] termStats);
/// <summary>
/// Creates a new <see cref="Similarity.SimScorer"/> to score matching documents from a segment of the inverted index.
/// </summary>
/// <param name="weight"> collection information from <see cref="ComputeWeight(float, CollectionStatistics, TermStatistics[])"/> </param>
/// <param name="context"> segment of the inverted index to be scored. </param>
/// <returns> Sloppy <see cref="SimScorer"/> for scoring documents across <c>context</c> </returns>
/// <exception cref="IOException"> if there is a low-level I/O error </exception>
public abstract SimScorer GetSimScorer(SimWeight weight, AtomicReaderContext context);
/// <summary>
/// API for scoring "sloppy" queries such as <see cref="TermQuery"/>,
/// <see cref="Spans.SpanQuery"/>, and <see cref="PhraseQuery"/>.
/// <para/>
/// Frequencies are floating-point values: an approximate
/// within-document frequency adjusted for "sloppiness" by
/// <see cref="SimScorer.ComputeSlopFactor(int)"/>.
/// </summary>
public abstract class SimScorer
{
    /// <summary>
    /// Sole constructor. (For invocation by subclass
    /// constructors, typically implicit.)
    /// </summary>
    protected SimScorer() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
    {
    }

    /// <summary>
    /// Scores a single document.
    /// </summary>
    /// <param name="doc"> document id within the inverted index segment </param>
    /// <param name="freq"> sloppy term frequency </param>
    /// <returns> document's score </returns>
    public abstract float Score(int doc, float freq);

    /// <summary>
    /// Computes the amount of a sloppy phrase match, based on an edit distance.
    /// </summary>
    /// <param name="distance"> the edit distance of the match </param>
    /// <returns> the slop factor for the given distance </returns>
    public abstract float ComputeSlopFactor(int distance);

    /// <summary>
    /// Calculates a scoring factor based on the data in the payload.
    /// </summary>
    /// <param name="doc"> document id </param>
    /// <param name="start"> start of the matched range </param>
    /// <param name="end"> end of the matched range </param>
    /// <param name="payload"> the payload data the factor is derived from </param>
    /// <returns> the payload-based scoring factor </returns>
    // NOTE(review): the exact meaning of start/end (term positions vs. offsets, inclusive vs.
    // exclusive) is not established in this file - confirm against the callers.
    public abstract float ComputePayloadFactor(int doc, int start, int end, BytesRef payload);

    /// <summary>
    /// Explains the score for a single document.
    /// <para/>
    /// The default implementation wraps the result of <see cref="Score(int, float)"/> in an
    /// <see cref="Explanation"/> and attaches <paramref name="freq"/> as its only detail.
    /// </summary>
    /// <param name="doc"> document id within the inverted index segment </param>
    /// <param name="freq"> Explanation of how the sloppy term frequency was computed </param>
    /// <returns> an <see cref="Explanation"/> of the document's score </returns>
    /// <exception cref="System.ArgumentNullException"> if <paramref name="freq"/> is <c>null</c> </exception>
    public virtual Explanation Explain(int doc, Explanation freq)
    {
        if (freq is null)
        {
            throw new System.ArgumentNullException(nameof(freq));
        }

        // Read the frequency once so the score and the description are built from the same value.
        var freqValue = freq.Value;
        float score = Score(doc, freqValue);

        // Format with the invariant culture: plain string concatenation would use the current
        // thread culture and render the frequency differently per machine (e.g. "0,5" vs "0.5").
        string description = string.Format(
            System.Globalization.CultureInfo.InvariantCulture,
            "score(doc={0},freq={1}), with freq of:",
            doc,
            freqValue);

        Explanation result = new Explanation(score, description);
        result.AddDetail(freq);
        return result;
    }
}
/// <summary>
/// Stores the weight for a query across the indexed collection. This abstract
/// implementation is empty; descendants of <see cref="Similarity"/> should
/// subclass <see cref="SimWeight"/> and define the statistics they require in the
/// subclass. Examples include idf, average field length, etc.
/// </summary>
public abstract class SimWeight
{
/// <summary>
/// Sole constructor. (For invocation by subclass
/// constructors, typically implicit.)
/// </summary>
protected SimWeight() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
{
}
/// <summary>
/// The value for normalization of contained query clauses (e.g. sum of squared weights).
/// <para/>
/// NOTE: a <see cref="Similarity"/> implementation might not use any query normalization at all;
/// it's not required. However, if it wants to participate in query normalization,
/// it can return a value here.
/// </summary>
/// <returns> the value this weight contributes to query normalization </returns>
public abstract float GetValueForNormalization();
/// <summary>
/// Assigns the query normalization factor and boost from parent queries to this.
/// <para/>
/// NOTE: a <see cref="Similarity"/> implementation might not use this normalized value at all;
/// it's not required. However, it's usually a good idea to at least incorporate
/// the <paramref name="topLevelBoost"/> (e.g. from an outer <see cref="BooleanQuery"/>) into its score.
/// </summary>
/// <param name="queryNorm"> the query normalization factor </param>
/// <param name="topLevelBoost"> the boost from parent queries </param>
public abstract void Normalize(float queryNorm, float topLevelBoost);
}
}
}