lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.similarities;


 import java.util.ArrayList;
 import java.util.List;

 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.similarities.Normalization.NoNormalization;

 /**
  * Implements the <em>divergence from randomness (DFR)</em> framework
  * introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002.
  * Probabilistic models of information retrieval based on measuring the
  * divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002),
  * 357-389.
  * <p>The DFR scoring formula is composed of three separate components: the
  * <em>basic model</em>, the <em>aftereffect</em> and an additional
  * <em>normalization</em> component, represented by the classes
  * {@code BasicModel}, {@code AfterEffect} and {@code Normalization},
  * respectively. The names of these classes were chosen to match the names of
  * their counterparts in the Terrier IR engine.</p>
  * <p>To construct a DFRSimilarity, you must specify the implementations for
  * all three components of DFR:
  * <ol>
  *    <li>{@link BasicModel}: Basic model of information content:
  *        <ul>
  *           <li>{@link BasicModelG}: Geometric approximation of Bose-Einstein
  *           <li>{@link BasicModelIn}: Inverse document frequency
  *           <li>{@link BasicModelIne}: Inverse expected document
  *               frequency [mixture of Poisson and IDF]
  *           <li>{@link BasicModelIF}: Inverse term frequency
  *               [approximation of I(ne)]
  *        </ul>
  *    <li>{@link AfterEffect}: First normalization of information
  *        gain:
  *        <ul>
  *           <li>{@link AfterEffectL}: Laplace's law of succession
  *           <li>{@link AfterEffectB}: Ratio of two Bernoulli processes
  *        </ul>
  *    <li>{@link Normalization}: Second (length) normalization:
  *        <ul>
  *           <li>{@link NormalizationH1}: Uniform distribution of term
  *               frequency
  *           <li>{@link NormalizationH2}: term frequency density inversely
  *               related to length
  *           <li>{@link NormalizationH3}: term frequency normalization
  *               provided by Dirichlet prior
  *           <li>{@link NormalizationZ}: term frequency normalization provided
  *                by a Zipfian relation
  *           <li>{@link NoNormalization}: no second normalization
  *        </ul>
  * </ol>
  * <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
  * is not handled by this implementation.</p>
  * <p>Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson
  * approximation of the Binomial) and D (Divergence approximation of the
  * Binomial) are not implemented because their formulas could not be written
  * in a way that makes scores non-decreasing with the normalized term
  * frequency.</p>
  * @see BasicModel
  * @see AfterEffect
  * @see Normalization
  * @lucene.experimental
  */
 public class DFRSimilarity extends SimilarityBase {
   /** Basic model of information content used when scoring. */
   protected final BasicModel basicModel;
   /** After effect, i.e. the first normalization of the information gain. */
   protected final AfterEffect afterEffect;
   /** Normalization that maps the raw frequency to the normalized term frequency. */
   protected final Normalization normalization;

   /**
    * Builds a DFRSimilarity out of its three components.
    * <p>
    * None of the arguments may be <code>null</code>; to disable the second
    * normalization pass a {@link NoNormalization} instance instead.
    * @param basicModel Basic model of information content
    * @param afterEffect First normalization of information gain
    * @param normalization Second (length) normalization
    * @throws NullPointerException if any of the three components is
    *         <code>null</code>
    */
   public DFRSimilarity(BasicModel basicModel,
                        AfterEffect afterEffect,
                        Normalization normalization) {
     final boolean anyMissing =
         basicModel == null || afterEffect == null || normalization == null;
     if (anyMissing) {
       throw new NullPointerException("null parameters not allowed.");
     }
     this.normalization = normalization;
     this.afterEffect = afterEffect;
     this.basicModel = basicModel;
   }

   /**
    * Scores a document: the frequency is first normalized, then the basic
    * model is evaluated with the after effect factor, and the result is
    * multiplied by the boost taken from {@code stats}.
    */
   @Override
   protected double score(BasicStats stats, double freq, double docLen) {
     final double normalizedTf = normalization.tfn(stats, freq, docLen);
     final double afterEffectFactor = afterEffect.scoreTimes1pTfn(stats);
     final double boost = stats.getBoost();
     return boost * basicModel.score(stats, normalizedTf, afterEffectFactor);
   }

   /**
    * Appends the per-component explanations to {@code subs}: the boost (only
    * when it differs from 1), followed by the normalization, the basic model
    * and the after effect, in that order.
    */
   @Override
   protected void explain(List<Explanation> subs,
       BasicStats stats, double freq, double docLen) {
     final double boost = stats.getBoost();
     if (boost != 1.0d) {
       subs.add(Explanation.match((float) boost, "boost, query boost"));
     }

     final Explanation normalizationDetail =
         normalization.explain(stats, freq, docLen);
     final double normalizedTf = normalization.tfn(stats, freq, docLen);
     final double afterEffectFactor = afterEffect.scoreTimes1pTfn(stats);

     subs.add(normalizationDetail);
     final Explanation basicModelDetail =
         basicModel.explain(stats, normalizedTf, afterEffectFactor);
     subs.add(basicModelDetail);
     subs.add(afterEffect.explain(stats, normalizedTf));
   }

   /**
    * Produces the top-level explanation: its value is the score for the
    * frequency carried by {@code freq}, and its details are the component
    * explanations collected by
    * {@link #explain(List, BasicStats, double, double)}.
    */
   @Override
   protected Explanation explain(
       BasicStats stats, Explanation freq, double docLen) {
     final Number rawFreq = freq.getValue();
     final double freqValue = rawFreq.doubleValue();

     final List<Explanation> details = new ArrayList<>();
     explain(details, stats, freqValue, docLen);

     final float total = (float) score(stats, freqValue, docLen);
     final String description =
         "score(" + getClass().getSimpleName() + ", freq=" + rawFreq
             + "), computed as boost * "
             + "basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:";
     return Explanation.match(total, description, details);
   }

   /**
    * Returns "DFR " followed by the string forms of the basic model, the
    * after effect and the normalization, concatenated without separators.
    */
   @Override
   public String toString() {
     final StringBuilder name = new StringBuilder("DFR ");
     name.append(basicModel.toString());
     name.append(afterEffect.toString());
     name.append(normalization.toString());
     return name.toString();
   }

   /**
    * Returns the basic model of information content this similarity was
    * created with.
    */
   public BasicModel getBasicModel() {
     return this.basicModel;
   }

   /**
    * Returns the after effect (first normalization) this similarity was
    * created with.
    */
   public AfterEffect getAfterEffect() {
     return this.afterEffect;
   }

   /**
    * Returns the second (length) normalization this similarity was created
    * with.
    */
   public Normalization getNormalization() {
     return this.normalization;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.similarities;


	import java.util.ArrayList;
	import java.util.List;

	import org.apache.lucene.search.Explanation;
	import org.apache.lucene.search.similarities.Normalization.NoNormalization;

	/**
	* Implements the <em>divergence from randomness (DFR)</em> framework
	* introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002.
	* Probabilistic models of information retrieval based on measuring the
	* divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002),
	* 357-389.
	* <p>The DFR scoring formula is composed of three separate components: the
	* <em>basic model</em>, the <em>aftereffect</em> and an additional
	* <em>normalization</em> component, represented by the classes
	* {@code BasicModel}, {@code AfterEffect} and {@code Normalization},
	* respectively. The names of these classes were chosen to match the names of
	* their counterparts in the Terrier IR engine.</p>
	* <p>To construct a DFRSimilarity, you must specify the implementations for
	* all three components of DFR:
	* <ol>
	* <li>{@link BasicModel}: Basic model of information content:
	* <ul>
	* <li>{@link BasicModelG}: Geometric approximation of Bose-Einstein
	* <li>{@link BasicModelIn}: Inverse document frequency
	* <li>{@link BasicModelIne}: Inverse expected document
	* frequency [mixture of Poisson and IDF]
	* <li>{@link BasicModelIF}: Inverse term frequency
	* [approximation of I(ne)]
	* </ul>
	* <li>{@link AfterEffect}: First normalization of information
	* gain:
	* <ul>
	* <li>{@link AfterEffectL}: Laplace's law of succession
	* <li>{@link AfterEffectB}: Ratio of two Bernoulli processes
	* </ul>
	* <li>{@link Normalization}: Second (length) normalization:
	* <ul>
	* <li>{@link NormalizationH1}: Uniform distribution of term
	* frequency
	* <li>{@link NormalizationH2}: term frequency density inversely
	* related to length
	* <li>{@link NormalizationH3}: term frequency normalization
	* provided by Dirichlet prior
	* <li>{@link NormalizationZ}: term frequency normalization provided
	* by a Zipfian relation
	* <li>{@link NoNormalization}: no second normalization
	* </ul>
	* </ol>
	* <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
	* is not handled by this implementation.</p>
	* <p>Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson
	* approximation of the Binomial) and D (Divergence approximation of the
	* Binomial) are not implemented because their formulas could not be written
	* in a way that makes scores non-decreasing with the normalized term
	* frequency.</p>
	* @see BasicModel
	* @see AfterEffect
	* @see Normalization
	* @lucene.experimental
	*/
	public class DFRSimilarity extends SimilarityBase {
	/** The basic model for information content. */
	protected final BasicModel basicModel;
	/** The first normalization of the information content. */
	protected final AfterEffect afterEffect;
	/** The term frequency normalization. */
	protected final Normalization normalization;

	/**
	* Creates DFRSimilarity from the three components.
	* <p>
	* Note that <code>null</code> values are not allowed:
	* if you want no normalization, instead pass
	* {@link NoNormalization}.
	* @param basicModel Basic model of information content
	* @param afterEffect First normalization of information gain
	* @param normalization Second (length) normalization
	*/
	public DFRSimilarity(BasicModel basicModel,
	AfterEffect afterEffect,
	Normalization normalization) {
	if (basicModel == null \|\| afterEffect == null \|\| normalization == null) {
	throw new NullPointerException("null parameters not allowed.");
	}
	this.basicModel = basicModel;
	this.afterEffect = afterEffect;
	this.normalization = normalization;
	}

	@Override
	protected double score(BasicStats stats, double freq, double docLen) {
	double tfn = normalization.tfn(stats, freq, docLen);
	double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
	return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn);
	}

	@Override
	protected void explain(List<Explanation> subs,
	BasicStats stats, double freq, double docLen) {
	if (stats.getBoost() != 1.0d) {
	subs.add(Explanation.match( (float)stats.getBoost(), "boost, query boost"));
	}

	Explanation normExpl = normalization.explain(stats, freq, docLen);
	double tfn = normalization.tfn(stats, freq, docLen);
	double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats);
	subs.add(normExpl);
	subs.add(basicModel.explain(stats, tfn, aeTimes1pTfn));
	subs.add(afterEffect.explain(stats, tfn));
	}

	@Override
	protected Explanation explain(
	BasicStats stats, Explanation freq, double docLen) {
	List<Explanation> subs = new ArrayList<>();
	explain(subs, stats, freq.getValue().doubleValue(), docLen);

	return Explanation.match(
	(float) score(stats, freq.getValue().doubleValue(), docLen),
	"score(" + getClass().getSimpleName() + ", freq=" +
	freq.getValue() +"), computed as boost * " +
	"basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:",
	subs);
	}

	@Override
	public String toString() {
	return "DFR " + basicModel.toString() + afterEffect.toString()
	+ normalization.toString();
	}

	/**
	* Returns the basic model of information content
	*/
	public BasicModel getBasicModel() {
	return basicModel;
	}

	/**
	* Returns the first normalization
	*/
	public AfterEffect getAfterEffect() {
	return afterEffect;
	}

	/**
	* Returns the second normalization
	*/
	public Normalization getNormalization() {
	return normalization;
	}
	}