blob: 2acc7b535abed211410bf5fcbe65109a2e19a839 [file] [log] [blame]
using System.Globalization;
namespace Lucene.Net.Search.Similarities
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Abstract superclass for language modeling Similarities. The following inner
/// types are introduced:
/// <list type="bullet">
/// <item><description><see cref="LMStats"/>, which defines a new statistic, the probability that
/// the collection language model generates the current term;</description></item>
/// <item><description><see cref="ICollectionModel"/>, which is a strategy interface for objects that
/// compute the collection language model <c>p(w|C)</c>;</description></item>
/// <item><description><see cref="DefaultCollectionModel"/>, an implementation of the former, that
/// computes the term probability as the number of occurrences of the term in the
/// collection, divided by the total number of tokens.</description></item>
/// </list>
/// <para/>
/// @lucene.experimental
/// </summary>
public abstract class LMSimilarity : SimilarityBase
{
/// <summary>
/// The collection model. It is assigned once by the constructor and is used to
/// compute the collection probability of a term and to contribute its name to
/// <see cref="ToString()"/>.
/// </summary>
protected readonly ICollectionModel m_collectionModel;
/// <summary>
/// Initializes the similarity with the given collection language model.
/// </summary>
/// <param name="collectionModel">The strategy stored in <see cref="m_collectionModel"/>.</param>
protected LMSimilarity(ICollectionModel collectionModel) // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
    => m_collectionModel = collectionModel;
/// <summary>
/// Initializes the similarity with a new <see cref="DefaultCollectionModel"/>
/// as its collection language model.
/// </summary>
protected LMSimilarity() : this(new DefaultCollectionModel()) // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
{
    // Nothing else to do: the chained constructor stores the model.
}
/// <summary>
/// Creates the statistics object for this similarity, an <see cref="LMStats"/>.
/// </summary>
/// <param name="field">The field name passed to the <see cref="LMStats"/> constructor.</param>
/// <param name="queryBoost">The query-time boost passed to the <see cref="LMStats"/> constructor.</param>
/// <returns>A new <see cref="LMStats"/> instance.</returns>
protected internal override BasicStats NewStats(string field, float queryBoost)
    => new LMStats(field, queryBoost);
/// <summary>
/// Fills the usual statistics through the base implementation, then stores the
/// collection probability of the current term on the stats object.
/// </summary>
/// <param name="stats">The statistics to fill; cast to <see cref="LMStats"/>.</param>
/// <param name="collectionStats">Collection-level statistics, forwarded to the base implementation.</param>
/// <param name="termStats">Term-level statistics, forwarded to the base implementation.</param>
protected internal override void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
{
    base.FillBasicStats(stats, collectionStats, termStats);

    // Cast first, then ask the collection model, then store the result.
    var languageModelStats = (LMStats)stats;
    float probability = m_collectionModel.ComputeProbability(stats);
    languageModelStats.CollectionProbability = probability;
}
/// <summary>
/// Adds a "collection probability" detail to the explanation, whose value is
/// computed by the collection model from <paramref name="stats"/>.
/// </summary>
/// <param name="expl">The explanation that receives the detail.</param>
/// <param name="stats">The statistics handed to the collection model.</param>
/// <param name="doc">Not used by this implementation.</param>
/// <param name="freq">Not used by this implementation.</param>
/// <param name="docLen">Not used by this implementation.</param>
protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    float probability = m_collectionModel.ComputeProbability(stats);
    var detail = new Explanation(probability, "collection probability");
    expl.AddDetail(detail);
}
/// <summary>
/// Returns the name of the LM method. The values of the parameters should be
/// included as well.
/// <para>Used in <see cref="ToString()"/>.</para>
/// </summary>
/// <returns>The name that <see cref="ToString()"/> places after the <c>"LM "</c> prefix.</returns>
public abstract string GetName();
/// <summary>
/// Returns the name of the LM method. If a custom collection model strategy is
/// used, its name is included as well.
/// </summary>
/// <remarks>
/// The result is <c>"LM {name} - {collection model name}"</c> when
/// <see cref="ICollectionModel.GetName()"/> returns a non-null name, and
/// <c>"LM {name}"</c> otherwise. The text is formatted with
/// <see cref="CultureInfo.InvariantCulture"/> so it does not depend on the
/// current culture.
/// </remarks>
/// <returns>The description of this similarity.</returns>
/// <seealso cref="GetName()"/>
/// <seealso cref="ICollectionModel.GetName()"/>
/// <seealso cref="DefaultCollectionModel"/>
public override string ToString()
{
    // The collection model is queried before GetName(), as in the original order.
    string coll = m_collectionModel.GetName();

    // A null name means there is no collection model name to append.
    if (coll == null)
    {
        return string.Format(CultureInfo.InvariantCulture, "LM {0}", GetName());
    }

    return string.Format(CultureInfo.InvariantCulture, "LM {0} - {1}", GetName(), coll);
}
/// <summary>
/// Statistics object that additionally holds the collection probability of the
/// current term.
/// </summary>
public class LMStats : BasicStats
{
    /// <summary>
    /// Creates <see cref="LMStats"/> for the provided field and query-time boost.
    /// </summary>
    /// <param name="field">The field name, forwarded to the base constructor.</param>
    /// <param name="queryBoost">The query-time boost, forwarded to the base constructor.</param>
    public LMStats(string field, float queryBoost)
        : base(field, queryBoost)
    {
    }

    /// <summary>
    /// Gets or sets the probability that the current term is generated by the
    /// collection.
    /// </summary>
    public float CollectionProbability { get; set; }
}
/// <summary>
/// A strategy for computing the collection language model. </summary>
public interface ICollectionModel
{
/// <summary>
/// Computes the probability <c>p(w|C)</c> according to the language model
/// strategy for the current term.
/// </summary>
/// <param name="stats">The statistics of the current term.</param>
/// <returns>The collection probability computed for the term.</returns>
float ComputeProbability(BasicStats stats);
/// <summary>
/// The name of the collection model strategy. </summary>
/// <returns>
/// The strategy name. May be <c>null</c>, as <see cref="DefaultCollectionModel"/>
/// returns; <see cref="LMSimilarity.ToString()"/> then omits the collection model name.
/// </returns>
string GetName();
}
/// <summary>
/// Models <c>p(w|C)</c> as the number of occurrences of the term in the
/// collection <c>+ 1</c>, divided by the total number of tokens <c>+ 1</c>.
/// </summary>
public class DefaultCollectionModel : ICollectionModel
{
    /// <summary>
    /// Sole constructor; takes no parameters.
    /// </summary>
    public DefaultCollectionModel()
    {
    }

    /// <summary>
    /// Computes <c>(TotalTermFreq + 1) / (NumberOfFieldTokens + 1)</c> from the
    /// given statistics, using single-precision arithmetic.
    /// </summary>
    /// <param name="stats">The statistics of the current term.</param>
    /// <returns>The collection probability of the term.</returns>
    public virtual float ComputeProbability(BasicStats stats)
        => (stats.TotalTermFreq + 1F) / (stats.NumberOfFieldTokens + 1F);

    /// <summary>
    /// The default model has no name.
    /// </summary>
    /// <returns>Always <c>null</c>.</returns>
    public virtual string GetName() => null;
}
}
}