src/Lucene.Net/Search/Similarities/Similarity.cs - lucenenet - Git at Google

 using System.IO;

 namespace Lucene.Net.Search.Similarities
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     // javadoc
     using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;

     // javadoc
     using BytesRef = Lucene.Net.Util.BytesRef;
     using FieldInvertState = Lucene.Net.Index.FieldInvertState;

     // javadoc

     /// <summary>
     /// Similarity defines the components of Lucene scoring.
     /// <para/>
     /// Expert: Scoring API.
     /// <para/>
     /// This is a low-level API, you should only extend this API if you want to implement
     /// an information retrieval <i>model</i>.  If you are instead looking for a convenient way
     /// to alter Lucene's scoring, consider extending a higher-level implementation
     /// such as <see cref="TFIDFSimilarity"/>, which implements the vector space model with this API, or
     /// just tweaking the default implementation: <see cref="DefaultSimilarity"/>.
     /// <para/>
     /// Similarity determines how Lucene weights terms, and Lucene interacts with
     /// this class at both <a href="#indextime">index-time</a> and
     /// <a href="#querytime">query-time</a>.
     /// <para/>
     /// <a name="indextime"/>
     /// At indexing time, the indexer calls <see cref="ComputeNorm(FieldInvertState)"/>, allowing
     /// the <see cref="Similarity"/> implementation to set a per-document value for the field that will
     /// be later accessible via <see cref="Index.AtomicReader.GetNormValues(string)"/>.  Lucene makes no assumption
     /// about what is in this norm, but it is most useful for encoding length normalization
     /// information.
     /// <para/>
     /// Implementations should carefully consider how the normalization is encoded: while
     /// Lucene's classical <see cref="TFIDFSimilarity"/> encodes a combination of index-time boost
     /// and length normalization information with <see cref="Util.SmallSingle"/> into a single byte, this
     /// might not be suitable for all purposes.
     /// <para/>
     /// Many formulas require the use of average document length, which can be computed via a
     /// combination of <see cref="CollectionStatistics.SumTotalTermFreq"/> and
     /// <see cref="CollectionStatistics.MaxDoc"/> or <see cref="CollectionStatistics.DocCount"/>,
     /// depending upon whether the average should reflect field sparsity.
     /// <para/>
     /// Additional scoring factors can be stored in named
     /// <see cref="Documents.NumericDocValuesField"/>s and accessed
     /// at query-time with <see cref="Index.AtomicReader.GetNumericDocValues(string)"/>.
     /// <para/>
     /// Finally, using index-time boosts (either via folding into the normalization byte or
     /// via <see cref="Index.DocValues"/>), is an inefficient way to boost the scores of different fields if the
     /// boost will be the same for every document, instead the Similarity can simply take a constant
     /// boost parameter <i>C</i>, and <see cref="PerFieldSimilarityWrapper"/> can return different
     /// instances with different boosts depending upon field name.
     /// <para/>
     /// <a name="querytime"/>
     /// At query-time, Queries interact with the Similarity via these steps:
     /// <list type="number">
     ///   <item><description>The <see cref="ComputeWeight(float, CollectionStatistics, TermStatistics[])"/> method is called a single time,
     ///       allowing the implementation to compute any statistics (such as IDF, average document length, etc)
     ///       across <i>the entire collection</i>. The <see cref="TermStatistics"/> and <see cref="CollectionStatistics"/> passed in
     ///       already contain all of the raw statistics involved, so a <see cref="Similarity"/> can freely use any combination
     ///       of statistics without causing any additional I/O. Lucene makes no assumption about what is
     ///       stored in the returned <see cref="Similarity.SimWeight"/> object.</description></item>
     ///   <item><description>The query normalization process occurs a single time: <see cref="Similarity.SimWeight.GetValueForNormalization()"/>
     ///       is called for each query leaf node, <see cref="Similarity.QueryNorm(float)"/> is called for the top-level
     ///       query, and finally <see cref="Similarity.SimWeight.Normalize(float, float)"/> passes down the normalization value
     ///       and any top-level boosts (e.g. from enclosing <see cref="BooleanQuery"/>s).</description></item>
     ///   <item><description>For each segment in the index, the <see cref="Query"/> creates a <see cref="SimScorer"/> via <see cref="GetSimScorer(SimWeight, AtomicReaderContext)"/>.
     ///       The <see cref="SimScorer.Score(int, float)"/> method is then called for each matching document.</description></item>
     /// </list>
     /// <para/>
     /// <a name="explaintime"/>
     /// When <see cref="IndexSearcher.Explain(Lucene.Net.Search.Query, int)"/> is called, queries consult the Similarity's <see cref="SimScorer"/> for an
     /// explanation of how it computed its score. The query passes in the document id and an explanation of how the frequency
     /// was computed.
     /// <para/>
     /// @lucene.experimental
     /// </summary>
     /// <seealso cref="Lucene.Net.Index.IndexWriterConfig.Similarity"/>
     /// <seealso cref="IndexSearcher.Similarity"/>
     public abstract class Similarity
     {
         /// <summary>
         /// Sole constructor. It performs no initialization of its own and exists so that
         /// subclass constructors can chain to it (typically implicitly).
         /// </summary>
         protected Similarity() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
         {
         }

         /// <summary>
         /// Extension point for coordinate-level matching.
         /// <para/>
         /// This base implementation switches the feature off by always returning <c>1</c>,
         /// because with most modern models it would only skew performance. Some
         /// implementations, such as <see cref="TFIDFSimilarity"/>, override it.
         /// </summary>
         /// <param name="overlap"> the number of query terms matched in the document (not used by this base implementation) </param>
         /// <param name="maxOverlap"> the total number of terms in the query (not used by this base implementation) </param>
         /// <returns> a score factor based on term overlap with the query; always <c>1</c> here </returns>
         public virtual float Coord(int overlap, int maxOverlap) => 1.0f;

         /// <summary>
         /// Computes the normalization value for a query from the sum of the
         /// normalized weights (<see cref="SimWeight.GetValueForNormalization()"/>) of
         /// each of the query terms. This value is passed back to the weight
         /// (<see cref="SimWeight.Normalize(float, float)"/>) of each query term, providing
         /// a hook to attempt to make scores from different queries comparable.
         /// <para/>
         /// This base implementation disables query normalization by always returning <c>1</c>;
         /// some implementations, such as <see cref="TFIDFSimilarity"/>, override it.
         /// </summary>
         /// <param name="valueForNormalization"> the sum of the term normalization values (not used by this base implementation) </param>
         /// <returns> a normalization factor for query weights; always <c>1</c> here </returns>
         public virtual float QueryNorm(float valueForNormalization) => 1.0f;

         /// <summary>
         /// Computes the normalization value for a field, given the accumulated
         /// state of term processing for this field (see <see cref="FieldInvertState"/>).
         /// <para/>
         /// Matches in longer fields are less precise, so implementations of this
         /// method usually set smaller values when <c>state.Length</c> is large,
         /// and larger values when <c>state.Length</c> is small.
         /// <para/>
         /// @lucene.experimental
         /// </summary>
         /// <param name="state"> current processing state for this field </param>
         /// <returns> computed norm value </returns>
         public abstract long ComputeNorm(FieldInvertState state);

         /// <summary>
         /// Computes any collection-level weight (e.g. IDF, average document length, etc.) needed for scoring a query.
         /// </summary>
         /// <param name="queryBoost"> the query-time boost. </param>
         /// <param name="collectionStats"> collection-level statistics, such as the number of tokens in the collection. </param>
         /// <param name="termStats"> term-level statistics, such as the document frequency of a term across the collection.
         /// Declared as a <c>params</c> array, so statistics for several terms can be supplied in one call. </param>
         /// <returns> <see cref="SimWeight"/> object with the information this <see cref="Similarity"/> needs to score a query. </returns>
         public abstract SimWeight ComputeWeight(float queryBoost, CollectionStatistics collectionStats, params TermStatistics[] termStats);

         /// <summary>
         /// Creates a new <see cref="Similarity.SimScorer"/> to score matching documents from a segment of the inverted index.
         /// </summary>
         /// <param name="weight"> collection information from <see cref="ComputeWeight(float, CollectionStatistics, TermStatistics[])"/> </param>
         /// <param name="context"> segment of the inverted index to be scored. </param>
         /// <returns> Sloppy <see cref="SimScorer"/> for scoring documents across <paramref name="context"/> </returns>
         /// <exception cref="IOException"> if there is a low-level I/O error </exception>
         public abstract SimScorer GetSimScorer(SimWeight weight, AtomicReaderContext context);

         /// <summary>
         /// Scoring API for "sloppy" queries such as <see cref="TermQuery"/>,
         /// <see cref="Spans.SpanQuery"/>, and <see cref="PhraseQuery"/>.
         /// <para/>
         /// Frequencies are floating-point values: an approximate
         /// within-document frequency adjusted for "sloppiness" by
         /// <see cref="SimScorer.ComputeSlopFactor(int)"/>.
         /// </summary>
         public abstract class SimScorer
         {
             /// <summary>
             /// Sole constructor. (Invoked by subclass constructors, typically implicitly.)
             /// </summary>
             protected SimScorer() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
             {
             }

             /// <summary>
             /// Scores a single document.
             /// </summary>
             /// <param name="doc"> document id within the inverted index segment </param>
             /// <param name="freq"> sloppy term frequency </param>
             /// <returns> document's score </returns>
             public abstract float Score(int doc, float freq);

             /// <summary>
             /// Computes the amount of a sloppy phrase match, based on an edit distance.
             /// </summary>
             /// <param name="distance"> the edit distance of the match </param>
             public abstract float ComputeSlopFactor(int distance);

             /// <summary>
             /// Calculates a scoring factor based on the data in the payload.
             /// </summary>
             public abstract float ComputePayloadFactor(int doc, int start, int end, BytesRef payload);

             /// <summary>
             /// Explains the score for a single document.
             /// <para/>
             /// This default implementation calls <see cref="Score(int, float)"/> with the value of
             /// <paramref name="freq"/> and attaches <paramref name="freq"/> as a detail of the result.
             /// </summary>
             /// <param name="doc"> document id within the inverted index segment </param>
             /// <param name="freq"> Explanation of how the sloppy term frequency was computed </param>
             /// <returns> an <see cref="Explanation"/> built from the document's score, with <paramref name="freq"/> added as a detail </returns>
             public virtual Explanation Explain(int doc, Explanation freq)
             {
                 // Score first, then build the text, matching the original evaluation order.
                 float score = Score(doc, freq.Value);
                 string description = "score(doc=" + doc + ",freq=" + freq.Value + "), with freq of:";

                 Explanation explanation = new Explanation(score, description);
                 explanation.AddDetail(freq);
                 return explanation;
             }
         }

         /// <summary>
         /// Stores the weight for a query across the indexed collection. This abstract
         /// implementation is empty; descendants of <see cref="Similarity"/> should
         /// subclass <see cref="SimWeight"/> and define the statistics they require in the
         /// subclass. Examples include idf, average field length, etc.
         /// </summary>
         public abstract class SimWeight
         {
             /// <summary>
             /// Sole constructor. (For invocation by subclass
             /// constructors, typically implicit.)
             /// </summary>
             protected SimWeight() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
             {
             }

             /// <summary>
             /// The value for normalization of contained query clauses (e.g. sum of squared weights).
             /// <para/>
             /// NOTE: a <see cref="Similarity"/> implementation might not use any query normalization at all;
             /// it's not required. However, if it wants to participate in query normalization,
             /// it can return a value here.
             /// </summary>
             /// <returns> the value this weight contributes to query normalization </returns>
             public abstract float GetValueForNormalization();

             /// <summary>
             /// Assigns the query normalization factor and boost from parent queries to this.
             /// <para/>
             /// NOTE: a <see cref="Similarity"/> implementation might not use this normalized value at all;
             /// it's not required. However, it's usually a good idea to at least incorporate
             /// the <paramref name="topLevelBoost"/> (e.g. from an outer <see cref="BooleanQuery"/>) into its score.
             /// </summary>
             /// <param name="queryNorm"> the query normalization factor </param>
             /// <param name="topLevelBoost"> the boost from parent queries </param>
             public abstract void Normalize(float queryNorm, float topLevelBoost);
         }
     }
 }
	using System.IO;

	namespace Lucene.Net.Search.Similarities
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// javadoc
	using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;

	// javadoc
	using BytesRef = Lucene.Net.Util.BytesRef;
	using FieldInvertState = Lucene.Net.Index.FieldInvertState;

	// javadoc

	/// <summary>
	/// Similarity defines the components of Lucene scoring.
	/// <para/>
	/// Expert: Scoring API.
	/// <para/>
	/// This is a low-level API, you should only extend this API if you want to implement
	/// an information retrieval <i>model</i>. If you are instead looking for a convenient way
	/// to alter Lucene's scoring, consider extending a higher-level implementation
	/// such as <see cref="TFIDFSimilarity"/>, which implements the vector space model with this API, or
	/// just tweaking the default implementation: <see cref="DefaultSimilarity"/>.
	/// <para/>
	/// Similarity determines how Lucene weights terms, and Lucene interacts with
	/// this class at both <a href="#indextime">index-time</a> and
	/// <a href="#querytime">query-time</a>.
	/// <para/>
	/// <a name="indextime"/>
	/// At indexing time, the indexer calls <see cref="ComputeNorm(FieldInvertState)"/>, allowing
	/// the <see cref="Similarity"/> implementation to set a per-document value for the field that will
	/// be later accessible via <see cref="Index.AtomicReader.GetNormValues(string)"/>. Lucene makes no assumption
	/// about what is in this norm, but it is most useful for encoding length normalization
	/// information.
	/// <para/>
	/// Implementations should carefully consider how the normalization is encoded: while
	/// Lucene's classical <see cref="TFIDFSimilarity"/> encodes a combination of index-time boost
	/// and length normalization information with <see cref="Util.SmallSingle"/> into a single byte, this
	/// might not be suitable for all purposes.
	/// <para/>
	/// Many formulas require the use of average document length, which can be computed via a
	/// combination of <see cref="CollectionStatistics.SumTotalTermFreq"/> and
	/// <see cref="CollectionStatistics.MaxDoc"/> or <see cref="CollectionStatistics.DocCount"/>,
	/// depending upon whether the average should reflect field sparsity.
	/// <para/>
	/// Additional scoring factors can be stored in named
	/// <see cref="Documents.NumericDocValuesField"/>s and accessed
	/// at query-time with <see cref="Index.AtomicReader.GetNumericDocValues(string)"/>.
	/// <para/>
	/// Finally, using index-time boosts (either via folding into the normalization byte or
	/// via <see cref="Index.DocValues"/>), is an inefficient way to boost the scores of different fields if the
	/// boost will be the same for every document, instead the Similarity can simply take a constant
	/// boost parameter <i>C</i>, and <see cref="PerFieldSimilarityWrapper"/> can return different
	/// instances with different boosts depending upon field name.
	/// <para/>
	/// <a name="querytime"/>
	/// At query-time, Queries interact with the Similarity via these steps:
	/// <list type="number">
	/// <item><description>The <see cref="ComputeWeight(float, CollectionStatistics, TermStatistics[])"/> method is called a single time,
	/// allowing the implementation to compute any statistics (such as IDF, average document length, etc)
	/// across <i>the entire collection</i>. The <see cref="TermStatistics"/> and <see cref="CollectionStatistics"/> passed in
	/// already contain all of the raw statistics involved, so a <see cref="Similarity"/> can freely use any combination
	/// of statistics without causing any additional I/O. Lucene makes no assumption about what is
	/// stored in the returned <see cref="Similarity.SimWeight"/> object.</description></item>
	/// <item><description>The query normalization process occurs a single time: <see cref="Similarity.SimWeight.GetValueForNormalization()"/>
	/// is called for each query leaf node, <see cref="Similarity.QueryNorm(float)"/> is called for the top-level
	/// query, and finally <see cref="Similarity.SimWeight.Normalize(float, float)"/> passes down the normalization value
	/// and any top-level boosts (e.g. from enclosing <see cref="BooleanQuery"/>s).</description></item>
	/// <item><description>For each segment in the index, the <see cref="Query"/> creates a <see cref="SimScorer"/> via <see cref="GetSimScorer(SimWeight, AtomicReaderContext)"/>.
	/// The <see cref="SimScorer.Score(int, float)"/> method is then called for each matching document.</description></item>
	/// </list>
	/// <para/>
	/// <a name="explaintime"/>
	/// When <see cref="IndexSearcher.Explain(Lucene.Net.Search.Query, int)"/> is called, queries consult the Similarity's <see cref="SimScorer"/> for an
	/// explanation of how it computed its score. The query passes in the document id and an explanation of how the frequency
	/// was computed.
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	/// <seealso cref="Lucene.Net.Index.IndexWriterConfig.Similarity"/>
	/// <seealso cref="IndexSearcher.Similarity"/>
	public abstract class Similarity
	{
	/// <summary>
	/// Sole constructor. It performs no initialization of its own and exists so that
	/// subclass constructors can chain to it (typically implicitly).
	/// </summary>
	protected Similarity() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
	{
	}

	/// <summary>
	/// Extension point for coordinate-level matching.
	/// <para/>
	/// This base implementation switches the feature off by always returning <c>1</c>,
	/// because with most modern models it would only skew performance. Some
	/// implementations, such as <see cref="TFIDFSimilarity"/>, override it.
	/// </summary>
	/// <param name="overlap"> the number of query terms matched in the document (not used by this base implementation) </param>
	/// <param name="maxOverlap"> the total number of terms in the query (not used by this base implementation) </param>
	/// <returns> a score factor based on term overlap with the query; always <c>1</c> here </returns>
	public virtual float Coord(int overlap, int maxOverlap) => 1.0f;

	/// <summary>
	/// Computes the normalization value for a query from the sum of the
	/// normalized weights (<see cref="SimWeight.GetValueForNormalization()"/>) of
	/// each of the query terms. This value is passed back to the weight
	/// (<see cref="SimWeight.Normalize(float, float)"/>) of each query term, providing
	/// a hook to attempt to make scores from different queries comparable.
	/// <para/>
	/// This base implementation disables query normalization by always returning <c>1</c>;
	/// some implementations, such as <see cref="TFIDFSimilarity"/>, override it.
	/// </summary>
	/// <param name="valueForNormalization"> the sum of the term normalization values (not used by this base implementation) </param>
	/// <returns> a normalization factor for query weights; always <c>1</c> here </returns>
	public virtual float QueryNorm(float valueForNormalization) => 1.0f;

	/// <summary>
	/// Computes the normalization value for a field, given the accumulated
	/// state of term processing for this field (see <see cref="FieldInvertState"/>).
	/// <para/>
	/// Matches in longer fields are less precise, so implementations of this
	/// method usually set smaller values when <c>state.Length</c> is large,
	/// and larger values when <c>state.Length</c> is small.
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	/// <param name="state"> current processing state for this field </param>
	/// <returns> computed norm value </returns>
	public abstract long ComputeNorm(FieldInvertState state);

	/// <summary>
	/// Computes any collection-level weight (e.g. IDF, average document length, etc.) needed for scoring a query.
	/// </summary>
	/// <param name="queryBoost"> the query-time boost. </param>
	/// <param name="collectionStats"> collection-level statistics, such as the number of tokens in the collection. </param>
	/// <param name="termStats"> term-level statistics, such as the document frequency of a term across the collection.
	/// Declared as a <c>params</c> array, so statistics for several terms can be supplied in one call. </param>
	/// <returns> <see cref="SimWeight"/> object with the information this <see cref="Similarity"/> needs to score a query. </returns>
	public abstract SimWeight ComputeWeight(float queryBoost, CollectionStatistics collectionStats, params TermStatistics[] termStats);

	/// <summary>
	/// Creates a new <see cref="Similarity.SimScorer"/> to score matching documents from a segment of the inverted index.
	/// </summary>
	/// <param name="weight"> collection information from <see cref="ComputeWeight(float, CollectionStatistics, TermStatistics[])"/> </param>
	/// <param name="context"> segment of the inverted index to be scored. </param>
	/// <returns> Sloppy <see cref="SimScorer"/> for scoring documents across <paramref name="context"/> </returns>
	/// <exception cref="IOException"> if there is a low-level I/O error </exception>
	public abstract SimScorer GetSimScorer(SimWeight weight, AtomicReaderContext context);

	/// <summary>
	/// Scoring API for "sloppy" queries such as <see cref="TermQuery"/>,
	/// <see cref="Spans.SpanQuery"/>, and <see cref="PhraseQuery"/>.
	/// <para/>
	/// Frequencies are floating-point values: an approximate
	/// within-document frequency adjusted for "sloppiness" by
	/// <see cref="SimScorer.ComputeSlopFactor(int)"/>.
	/// </summary>
	public abstract class SimScorer
	{
	    /// <summary>
	    /// Sole constructor. (Invoked by subclass constructors, typically implicitly.)
	    /// </summary>
	    protected SimScorer() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
	    {
	    }

	    /// <summary>
	    /// Scores a single document.
	    /// </summary>
	    /// <param name="doc"> document id within the inverted index segment </param>
	    /// <param name="freq"> sloppy term frequency </param>
	    /// <returns> document's score </returns>
	    public abstract float Score(int doc, float freq);

	    /// <summary>
	    /// Computes the amount of a sloppy phrase match, based on an edit distance.
	    /// </summary>
	    /// <param name="distance"> the edit distance of the match </param>
	    public abstract float ComputeSlopFactor(int distance);

	    /// <summary>
	    /// Calculates a scoring factor based on the data in the payload.
	    /// </summary>
	    public abstract float ComputePayloadFactor(int doc, int start, int end, BytesRef payload);

	    /// <summary>
	    /// Explains the score for a single document.
	    /// <para/>
	    /// This default implementation calls <see cref="Score(int, float)"/> with the value of
	    /// <paramref name="freq"/> and attaches <paramref name="freq"/> as a detail of the result.
	    /// </summary>
	    /// <param name="doc"> document id within the inverted index segment </param>
	    /// <param name="freq"> Explanation of how the sloppy term frequency was computed </param>
	    /// <returns> an <see cref="Explanation"/> built from the document's score, with <paramref name="freq"/> added as a detail </returns>
	    public virtual Explanation Explain(int doc, Explanation freq)
	    {
	        // Score first, then build the text, matching the original evaluation order.
	        float score = Score(doc, freq.Value);
	        string description = "score(doc=" + doc + ",freq=" + freq.Value + "), with freq of:";

	        Explanation explanation = new Explanation(score, description);
	        explanation.AddDetail(freq);
	        return explanation;
	    }
	}

	/// <summary>
	/// Stores the weight for a query across the indexed collection. This abstract
	/// implementation is empty; descendants of <see cref="Similarity"/> should
	/// subclass <see cref="SimWeight"/> and define the statistics they require in the
	/// subclass. Examples include idf, average field length, etc.
	/// </summary>
	public abstract class SimWeight
	{
	/// <summary>
	/// Sole constructor. (For invocation by subclass
	/// constructors, typically implicit.)
	/// </summary>
	protected SimWeight() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
	{
	}

	/// <summary>
	/// The value for normalization of contained query clauses (e.g. sum of squared weights).
	/// <para/>
	/// NOTE: a <see cref="Similarity"/> implementation might not use any query normalization at all;
	/// it's not required. However, if it wants to participate in query normalization,
	/// it can return a value here.
	/// </summary>
	/// <returns> the value this weight contributes to query normalization </returns>
	public abstract float GetValueForNormalization();

	/// <summary>
	/// Assigns the query normalization factor and boost from parent queries to this.
	/// <para/>
	/// NOTE: a <see cref="Similarity"/> implementation might not use this normalized value at all;
	/// it's not required. However, it's usually a good idea to at least incorporate
	/// the <paramref name="topLevelBoost"/> (e.g. from an outer <see cref="BooleanQuery"/>) into its score.
	/// </summary>
	/// <param name="queryNorm"> the query normalization factor </param>
	/// <param name="topLevelBoost"> the boost from parent queries </param>
	public abstract void Normalize(float queryNorm, float topLevelBoost);
	}
	}
	}