src/Lucene.Net/Search/Similarities/DefaultSimilarity.cs - lucenenet - Git at Google

 using System;

 namespace Lucene.Net.Search.Similarities
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     using BytesRef = Lucene.Net.Util.BytesRef;
     using FieldInvertState = Lucene.Net.Index.FieldInvertState;
     using SmallSingle = Lucene.Net.Util.SmallSingle;

     /// <summary>
     /// Expert: Default scoring implementation which encodes (<see cref="EncodeNormValue(float)"/>)
     /// norm values as a single byte before being stored. At search time,
     /// the norm byte value is read from the index
     /// <see cref="Lucene.Net.Store.Directory"/> and
     /// decoded (<see cref="DecodeNormValue(long)"/>) back to a float <i>norm</i> value.
     /// this encoding/decoding, while reducing index size, comes with the price of
     /// precision loss - it is not guaranteed that <i>Decode(Encode(x)) = x</i>. For
     /// instance, <i>Decode(Encode(0.89)) = 0.75</i>.
     /// <para/>
     /// Compression of norm values to a single byte saves memory at search time,
     /// because once a field is referenced at search time, its norms - for all
     /// documents - are maintained in memory.
     /// <para/>
     /// The rationale supporting such lossy compression of norm values is that given
     /// the difficulty (and inaccuracy) of users to express their true information
     /// need by a query, only big differences matter.
     /// <para/>
     /// Last, note that search time is too late to modify this <i>norm</i> part of
     /// scoring, e.g. by using a different <see cref="Similarity"/> for search.
     /// </summary>
     public class DefaultSimilarity : TFIDFSimilarity
     {
         /// <summary>
         /// Cache of decoded bytes. </summary>
         private static readonly float[] NORM_TABLE = LoadNormTable();

         private static float[] LoadNormTable() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
         {
             float[] normTable = new float[256];
             for (int i = 0; i < 256; i++)
             {
                 normTable[i] = SmallSingle.SByte315ToSingle((sbyte)i);
             }
             return normTable;
         }

         /// <summary>
         /// Sole constructor: parameter-free </summary>
         public DefaultSimilarity()
         {
         }

         /// <summary>
         /// Implemented as <c>overlap / maxOverlap</c>. </summary>
         public override float Coord(int overlap, int maxOverlap)
         {
             return overlap / (float)maxOverlap;
         }

         /// <summary>
         /// Implemented as <c>1/sqrt(sumOfSquaredWeights)</c>. </summary>
         public override float QueryNorm(float sumOfSquaredWeights)
         {
             return (float)(1.0 / Math.Sqrt(sumOfSquaredWeights));
         }

         /// <summary>
         /// Encodes a normalization factor for storage in an index.
         /// <para/>
         /// The encoding uses a three-bit mantissa, a five-bit exponent, and the
         /// zero-exponent point at 15, thus representing values from around 7x10^9 to
         /// 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
         /// represented. Negative numbers are rounded up to zero. Values too large to
         /// represent are rounded down to the largest representable value. Positive
         /// values too small to represent are rounded up to the smallest positive
         /// representable value.
         /// </summary>
         /// <seealso cref="Lucene.Net.Documents.Field.Boost"/>
         /// <seealso cref="Lucene.Net.Util.SmallSingle"/>
         public override sealed long EncodeNormValue(float f)
         {
             return SmallSingle.SingleToSByte315(f);
         }

         /// <summary>
         /// Decodes the norm value, assuming it is a single byte.
         /// </summary>
         /// <seealso cref="EncodeNormValue(float)"/>
         public override sealed float DecodeNormValue(long norm)
         {
             return NORM_TABLE[(int)(norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
         }

         /// <summary>
         /// Implemented as
         /// <c>state.Boost * LengthNorm(numTerms)</c>, where
         /// <c>numTerms</c> is <see cref="FieldInvertState.Length"/> if
         /// <see cref="DiscountOverlaps"/> is <c>false</c>, else it's
         /// <see cref="FieldInvertState.Length"/> -
         /// <see cref="FieldInvertState.NumOverlap"/>.
         ///
         /// <para/>
         /// @lucene.experimental
         /// </summary>
         public override float LengthNorm(FieldInvertState state)
         {
             int numTerms;
             if (m_discountOverlaps)
             {
                 numTerms = state.Length - state.NumOverlap;
             }
             else
             {
                 numTerms = state.Length;
             }
             return state.Boost * ((float)(1.0 / Math.Sqrt(numTerms)));
         }

         /// <summary>
         /// Implemented as <c>Math.Sqrt(freq)</c>. </summary>
         public override float Tf(float freq)
         {
             return (float)Math.Sqrt(freq);
         }

         /// <summary>
         /// Implemented as <c>1 / (distance + 1)</c>. </summary>
         public override float SloppyFreq(int distance)
         {
             return 1.0f / (distance + 1);
         }

         /// <summary>
         /// The default implementation returns <c>1</c> </summary>
         public override float ScorePayload(int doc, int start, int end, BytesRef payload)
         {
             return 1;
         }

         /// <summary>
         /// Implemented as <c>log(numDocs/(docFreq+1)) + 1</c>. </summary>
         public override float Idf(long docFreq, long numDocs)
         {
             return (float)(Math.Log(numDocs / (double)(docFreq + 1)) + 1.0);
         }

         /// <summary>
         /// <c>True</c> if overlap tokens (tokens with a position of increment of zero) are
         /// discounted from the document's length.
         /// </summary>
         protected bool m_discountOverlaps = true;

         /// <summary>
         /// Determines whether overlap tokens (Tokens with
         /// 0 position increment) are ignored when computing
         /// norm.  By default this is true, meaning overlap
         /// tokens do not count when computing norms.
         /// <para/>
         /// @lucene.experimental
         /// </summary>
         /// <seealso cref="TFIDFSimilarity.ComputeNorm(FieldInvertState)"/>
         public virtual bool DiscountOverlaps
         {
             get => m_discountOverlaps;
             set => m_discountOverlaps = value;
         }

         public override string ToString()
         {
             return "DefaultSimilarity";
         }
     }
 }
	using System;

	namespace Lucene.Net.Search.Similarities
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	using BytesRef = Lucene.Net.Util.BytesRef;
	using FieldInvertState = Lucene.Net.Index.FieldInvertState;
	using SmallSingle = Lucene.Net.Util.SmallSingle;

	/// <summary>
	/// Expert: Default scoring implementation which encodes (<see cref="EncodeNormValue(float)"/>)
	/// norm values as a single byte before being stored. At search time,
	/// the norm byte value is read from the index
	/// <see cref="Lucene.Net.Store.Directory"/> and
	/// decoded (<see cref="DecodeNormValue(long)"/>) back to a float <i>norm</i> value.
	/// this encoding/decoding, while reducing index size, comes with the price of
	/// precision loss - it is not guaranteed that <i>Decode(Encode(x)) = x</i>. For
	/// instance, <i>Decode(Encode(0.89)) = 0.75</i>.
	/// <para/>
	/// Compression of norm values to a single byte saves memory at search time,
	/// because once a field is referenced at search time, its norms - for all
	/// documents - are maintained in memory.
	/// <para/>
	/// The rationale supporting such lossy compression of norm values is that given
	/// the difficulty (and inaccuracy) of users to express their true information
	/// need by a query, only big differences matter.
	/// <para/>
	/// Last, note that search time is too late to modify this <i>norm</i> part of
	/// scoring, e.g. by using a different <see cref="Similarity"/> for search.
	/// </summary>
	public class DefaultSimilarity : TFIDFSimilarity
	{
	/// <summary>
	/// Cache of decoded bytes. </summary>
	private static readonly float[] NORM_TABLE = LoadNormTable();

	private static float[] LoadNormTable() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
	{
	float[] normTable = new float[256];
	for (int i = 0; i < 256; i++)
	{
	normTable[i] = SmallSingle.SByte315ToSingle((sbyte)i);
	}
	return normTable;
	}

	/// <summary>
	/// Sole constructor: parameter-free </summary>
	public DefaultSimilarity()
	{
	}

	/// <summary>
	/// Implemented as <c>overlap / maxOverlap</c>. </summary>
	public override float Coord(int overlap, int maxOverlap)
	{
	return overlap / (float)maxOverlap;
	}

	/// <summary>
	/// Implemented as <c>1/sqrt(sumOfSquaredWeights)</c>. </summary>
	public override float QueryNorm(float sumOfSquaredWeights)
	{
	return (float)(1.0 / Math.Sqrt(sumOfSquaredWeights));
	}

	/// <summary>
	/// Encodes a normalization factor for storage in an index.
	/// <para/>
	/// The encoding uses a three-bit mantissa, a five-bit exponent, and the
	/// zero-exponent point at 15, thus representing values from around 7x10^9 to
	/// 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
	/// represented. Negative numbers are rounded up to zero. Values too large to
	/// represent are rounded down to the largest representable value. Positive
	/// values too small to represent are rounded up to the smallest positive
	/// representable value.
	/// </summary>
	/// <seealso cref="Lucene.Net.Documents.Field.Boost"/>
	/// <seealso cref="Lucene.Net.Util.SmallSingle"/>
	public override sealed long EncodeNormValue(float f)
	{
	return SmallSingle.SingleToSByte315(f);
	}

	/// <summary>
	/// Decodes the norm value, assuming it is a single byte.
	/// </summary>
	/// <seealso cref="EncodeNormValue(float)"/>
	public override sealed float DecodeNormValue(long norm)
	{
	return NORM_TABLE[(int)(norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
	}

	/// <summary>
	/// Implemented as
	/// <c>state.Boost * LengthNorm(numTerms)</c>, where
	/// <c>numTerms</c> is <see cref="FieldInvertState.Length"/> if
	/// <see cref="DiscountOverlaps"/> is <c>false</c>, else it's
	/// <see cref="FieldInvertState.Length"/> -
	/// <see cref="FieldInvertState.NumOverlap"/>.
	///
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	public override float LengthNorm(FieldInvertState state)
	{
	int numTerms;
	if (m_discountOverlaps)
	{
	numTerms = state.Length - state.NumOverlap;
	}
	else
	{
	numTerms = state.Length;
	}
	return state.Boost * ((float)(1.0 / Math.Sqrt(numTerms)));
	}

	/// <summary>
	/// Implemented as <c>Math.Sqrt(freq)</c>. </summary>
	public override float Tf(float freq)
	{
	return (float)Math.Sqrt(freq);
	}

	/// <summary>
	/// Implemented as <c>1 / (distance + 1)</c>. </summary>
	public override float SloppyFreq(int distance)
	{
	return 1.0f / (distance + 1);
	}

	/// <summary>
	/// The default implementation returns <c>1</c> </summary>
	public override float ScorePayload(int doc, int start, int end, BytesRef payload)
	{
	return 1;
	}

	/// <summary>
	/// Implemented as <c>log(numDocs/(docFreq+1)) + 1</c>. </summary>
	public override float Idf(long docFreq, long numDocs)
	{
	return (float)(Math.Log(numDocs / (double)(docFreq + 1)) + 1.0);
	}

	/// <summary>
	/// <c>True</c> if overlap tokens (tokens with a position of increment of zero) are
	/// discounted from the document's length.
	/// </summary>
	protected bool m_discountOverlaps = true;

	/// <summary>
	/// Determines whether overlap tokens (Tokens with
	/// 0 position increment) are ignored when computing
	/// norm. By default this is true, meaning overlap
	/// tokens do not count when computing norms.
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	/// <seealso cref="TFIDFSimilarity.ComputeNorm(FieldInvertState)"/>
	public virtual bool DiscountOverlaps
	{
	get => m_discountOverlaps;
	set => m_discountOverlaps = value;
	}

	public override string ToString()
	{
	return "DefaultSimilarity";
	}
	}
	}