| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.similarities; |
| |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.lucene.search.Explanation; |
| import org.apache.lucene.search.similarities.Normalization.NoNormalization; |
| |
| /** |
| * Implements the <em>divergence from randomness (DFR)</em> framework |
| * introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002. |
| * Probabilistic models of information retrieval based on measuring the |
| * divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002), |
| * 357-389. |
| * <p>The DFR scoring formula is composed of three separate components: the |
| * <em>basic model</em>, the <em>aftereffect</em> and an additional |
| * <em>normalization</em> component, represented by the classes |
| * {@code BasicModel}, {@code AfterEffect} and {@code Normalization}, |
| * respectively. The names of these classes were chosen to match the names of |
| * their counterparts in the Terrier IR engine.</p> |
| * <p>To construct a DFRSimilarity, you must specify the implementations for |
| * all three components of DFR: |
| * <ol> |
| * <li>{@link BasicModel}: Basic model of information content: |
| * <ul> |
| * <li>{@link BasicModelG}: Geometric approximation of Bose-Einstein |
| * <li>{@link BasicModelIn}: Inverse document frequency |
| * <li>{@link BasicModelIne}: Inverse expected document |
| * frequency [mixture of Poisson and IDF] |
| * <li>{@link BasicModelIF}: Inverse term frequency |
| * [approximation of I(ne)] |
| * </ul> |
| * <li>{@link AfterEffect}: First normalization of information |
| * gain: |
| * <ul> |
| * <li>{@link AfterEffectL}: Laplace's law of succession |
| * <li>{@link AfterEffectB}: Ratio of two Bernoulli processes |
| * </ul> |
| * <li>{@link Normalization}: Second (length) normalization: |
| * <ul> |
| * <li>{@link NormalizationH1}: Uniform distribution of term |
| * frequency |
| * <li>{@link NormalizationH2}: term frequency density inversely |
| * related to length |
| * <li>{@link NormalizationH3}: term frequency normalization |
| * provided by Dirichlet prior |
| * <li>{@link NormalizationZ}: term frequency normalization provided |
| * by a Zipfian relation |
| * <li>{@link NoNormalization}: no second normalization |
| * </ul> |
| * </ol> |
| * <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query, |
| * is not handled by this implementation.</p> |
| * <p>Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson |
| * approximation of the Binomial) and D (Divergence approximation of the |
| * Binomial) are not implemented because their formulas could not be written in |
| * a way that makes scores non-decreasing with the normalized term frequency.</p> |
| * @see BasicModel |
| * @see AfterEffect |
| * @see Normalization |
| * @lucene.experimental |
| */ |
| public class DFRSimilarity extends SimilarityBase { |
| /** The basic model for information content. */ |
| protected final BasicModel basicModel; |
| /** The first normalization of the information content. */ |
| protected final AfterEffect afterEffect; |
| /** The term frequency normalization. */ |
| protected final Normalization normalization; |
| |
| /** |
| * Creates DFRSimilarity from the three components. |
| * <p> |
| * Note that <code>null</code> values are not allowed: |
| * if you want no normalization, instead pass |
| * {@link NoNormalization}. |
| * @param basicModel Basic model of information content |
| * @param afterEffect First normalization of information gain |
| * @param normalization Second (length) normalization |
| */ |
| public DFRSimilarity(BasicModel basicModel, |
| AfterEffect afterEffect, |
| Normalization normalization) { |
| if (basicModel == null || afterEffect == null || normalization == null) { |
| throw new NullPointerException("null parameters not allowed."); |
| } |
| this.basicModel = basicModel; |
| this.afterEffect = afterEffect; |
| this.normalization = normalization; |
| } |
| |
| @Override |
| protected double score(BasicStats stats, double freq, double docLen) { |
| double tfn = normalization.tfn(stats, freq, docLen); |
| double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats); |
| return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn); |
| } |
| |
| @Override |
| protected void explain(List<Explanation> subs, |
| BasicStats stats, double freq, double docLen) { |
| if (stats.getBoost() != 1.0d) { |
| subs.add(Explanation.match( (float)stats.getBoost(), "boost, query boost")); |
| } |
| |
| Explanation normExpl = normalization.explain(stats, freq, docLen); |
| double tfn = normalization.tfn(stats, freq, docLen); |
| double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats); |
| subs.add(normExpl); |
| subs.add(basicModel.explain(stats, tfn, aeTimes1pTfn)); |
| subs.add(afterEffect.explain(stats, tfn)); |
| } |
| |
| @Override |
| protected Explanation explain( |
| BasicStats stats, Explanation freq, double docLen) { |
| List<Explanation> subs = new ArrayList<>(); |
| explain(subs, stats, freq.getValue().doubleValue(), docLen); |
| |
| return Explanation.match( |
| (float) score(stats, freq.getValue().doubleValue(), docLen), |
| "score(" + getClass().getSimpleName() + ", freq=" + |
| freq.getValue() +"), computed as boost * " + |
| "basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:", |
| subs); |
| } |
| |
| @Override |
| public String toString() { |
| return "DFR " + basicModel.toString() + afterEffect.toString() |
| + normalization.toString(); |
| } |
| |
| /** |
| * Returns the basic model of information content |
| */ |
| public BasicModel getBasicModel() { |
| return basicModel; |
| } |
| |
| /** |
| * Returns the first normalization |
| */ |
| public AfterEffect getAfterEffect() { |
| return afterEffect; |
| } |
| |
| /** |
| * Returns the second normalization |
| */ |
| public Normalization getNormalization() { |
| return normalization; |
| } |
| } |