| using System; |
| |
| namespace Lucene.Net.Search.Similarities |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Implements the <em>divergence from randomness (DFR)</em> framework |
| /// introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002. |
| /// Probabilistic models of information retrieval based on measuring the |
| /// divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002), |
| /// 357-389. |
| /// <para>The DFR scoring formula is composed of three separate components: the |
| /// <em>basic model</em>, the <em>aftereffect</em> and an additional |
| /// <em>normalization</em> component, represented by the classes |
| /// <see cref="Similarities.BasicModel"/>, <see cref="Similarities.AfterEffect"/> and <see cref="Similarities.Normalization"/>, |
| /// respectively. The names of these classes were chosen to match the names of |
| /// their counterparts in the Terrier IR engine.</para> |
| /// <para>To construct a <see cref="DFRSimilarity"/>, you must specify the implementations for |
| /// all three components of DFR: |
| /// <list type="table"> |
| /// <listheader> |
| /// <term>Component</term> |
| /// <term>Implementations</term> |
| /// </listheader> |
| /// <item> |
| /// <term><see cref="Similarities.BasicModel"/>: Basic model of information content:</term> |
| /// <term> |
| /// <list type="bullet"> |
| /// <item><description><see cref="BasicModelBE"/>: Limiting form of Bose-Einstein</description></item> |
| /// <item><description><see cref="BasicModelG"/>: Geometric approximation of Bose-Einstein</description></item> |
| /// <item><description><see cref="BasicModelP"/>: Poisson approximation of the Binomial</description></item> |
| /// <item><description><see cref="BasicModelD"/>: Divergence approximation of the Binomial</description></item> |
| /// <item><description><see cref="BasicModelIn"/>: Inverse document frequency</description></item> |
| /// <item><description><see cref="BasicModelIne"/>: Inverse expected document frequency [mixture of Poisson and IDF]</description></item> |
| /// <item><description><see cref="BasicModelIF"/>: Inverse term frequency [approximation of I(ne)]</description></item> |
| /// </list> |
| /// </term> |
| /// </item> |
| /// <item> |
| /// <term><see cref="Similarities.AfterEffect"/>: First normalization of information gain:</term> |
| /// <term> |
| /// <list type="bullet"> |
| /// <item><description><see cref="AfterEffectL"/>: Laplace's law of succession</description></item> |
| /// <item><description><see cref="AfterEffectB"/>: Ratio of two Bernoulli processes</description></item> |
| /// <item><description><see cref="AfterEffect.NoAfterEffect"/>: no first normalization</description></item> |
| /// </list> |
| /// </term> |
| /// </item> |
| /// <item> |
| /// <term><see cref="Similarities.Normalization"/>: Second (length) normalization:</term> |
| /// <term> |
| /// <list type="bullet"> |
| /// <item><description><see cref="NormalizationH1"/>: Uniform distribution of term frequency</description></item> |
| /// <item><description><see cref="NormalizationH2"/>: term frequency density inversely related to length</description></item> |
| /// <item><description><see cref="NormalizationH3"/>: term frequency normalization provided by Dirichlet prior</description></item> |
| /// <item><description><see cref="NormalizationZ"/>: term frequency normalization provided by a Zipfian relation</description></item> |
| /// <item><description><see cref="Normalization.NoNormalization"/>: no second normalization</description></item> |
| /// </list> |
| /// </term> |
| /// </item> |
| /// </list> |
| /// |
| /// </para> |
| /// <para>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query, |
| /// is not handled by this implementation. |
| /// </para> |
| /// @lucene.experimental |
| /// </summary> |
| /// <seealso cref="Similarities.BasicModel"/> |
| /// <seealso cref="Similarities.AfterEffect"/> |
| /// <seealso cref="Similarities.Normalization"/> |
| public class DFRSimilarity : SimilarityBase |
| { |
| /// <summary> |
| /// The basic model for information content. </summary> |
| protected internal readonly BasicModel m_basicModel; |
| |
| /// <summary> |
| /// The first normalization of the information content. </summary> |
| protected internal readonly AfterEffect m_afterEffect; |
| |
| /// <summary> |
| /// The term frequency normalization. </summary> |
| protected internal readonly Normalization m_normalization; |
| |
| /// <summary> |
| /// Creates DFRSimilarity from the three components. |
| /// <para/> |
| /// Note that <c>null</c> values are not allowed: |
| /// if you want no normalization or after-effect, instead pass |
| /// <see cref="Normalization.NoNormalization"/> or <see cref="AfterEffect.NoAfterEffect"/> respectively. </summary> |
| /// <param name="basicModel"> Basic model of information content </param> |
| /// <param name="afterEffect"> First normalization of information gain </param> |
| /// <param name="normalization"> Second (length) normalization </param> |
| /// <exception cref="ArgumentNullException"><paramref name="basicModel"/>, <paramref name="afterEffect"/>, |
| /// or <paramref name="normalization"/> is <c>null</c>.</exception> |
| public DFRSimilarity(BasicModel basicModel, AfterEffect afterEffect, Normalization normalization) |
| { |
| // LUCENENET: Changed guard clauses from NullPointerException to ArgumentNullException |
| this.m_basicModel = basicModel ?? throw new ArgumentNullException(nameof(basicModel)); |
| this.m_afterEffect = afterEffect ?? throw new ArgumentNullException(nameof(afterEffect)); |
| this.m_normalization = normalization ?? throw new ArgumentNullException(nameof(normalization)); |
| } |
| |
| public override float Score(BasicStats stats, float freq, float docLen) |
| { |
| float tfn = m_normalization.Tfn(stats, freq, docLen); |
| return stats.TotalBoost * m_basicModel.Score(stats, tfn) * m_afterEffect.Score(stats, tfn); |
| } |
| |
| protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen) |
| { |
| if (stats.TotalBoost != 1.0f) |
| { |
| expl.AddDetail(new Explanation(stats.TotalBoost, "boost")); |
| } |
| |
| Explanation normExpl = m_normalization.Explain(stats, freq, docLen); |
| float tfn = normExpl.Value; |
| expl.AddDetail(normExpl); |
| expl.AddDetail(m_basicModel.Explain(stats, tfn)); |
| expl.AddDetail(m_afterEffect.Explain(stats, tfn)); |
| } |
| |
| public override string ToString() |
| { |
| return "DFR " + m_basicModel.ToString() + m_afterEffect.ToString() + m_normalization.ToString(); |
| } |
| |
| /// <summary> |
| /// Returns the basic model of information content |
| /// </summary> |
| public virtual BasicModel BasicModel => m_basicModel; |
| |
| /// <summary> |
| /// Returns the first normalization |
| /// </summary> |
| public virtual AfterEffect AfterEffect => m_afterEffect; |
| |
| /// <summary> |
| /// Returns the second normalization |
| /// </summary> |
| public virtual Normalization Normalization => m_normalization; |
| } |
| } |