| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.similarities; |
| |
| |
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.similarities.Normalization.NoNormalization;
| |
| /** |
| * Provides a framework for the family of information-based models, as described |
| * in Stéphane Clinchant and Eric Gaussier. 2010. Information-based |
| * models for ad hoc IR. In Proceedings of the 33rd international ACM SIGIR |
| * conference on Research and development in information retrieval (SIGIR '10). |
| * ACM, New York, NY, USA, 234-241. |
| * <p>The retrieval function is of the form <em>RSV(q, d) = ∑ |
| * -x<sup>q</sup><sub>w</sub> log Prob(X<sub>w</sub> ≥ |
| * t<sup>d</sup><sub>w</sub> | λ<sub>w</sub>)</em>, where |
| * <ul> |
| * <li><em>x<sup>q</sup><sub>w</sub></em> is the query boost;</li> |
| * <li><em>X<sub>w</sub></em> is a random variable that counts the occurrences |
| * of word <em>w</em>;</li> |
| * <li><em>t<sup>d</sup><sub>w</sub></em> is the normalized term frequency;</li> |
| * <li><em>λ<sub>w</sub></em> is a parameter.</li> |
| * </ul> |
| * <p>The framework described in the paper has many similarities to the DFR |
| * framework (see {@link DFRSimilarity}). It is possible that the two |
| * Similarities will be merged at one point.</p> |
| * <p>To construct an IBSimilarity, you must specify the implementations for |
| * all three components of the Information-Based model. |
| * <ol> |
| * <li>{@link Distribution}: Probabilistic distribution used to |
| * model term occurrence |
| * <ul> |
| * <li>{@link DistributionLL}: Log-logistic</li> |
| * <li>{@link DistributionSPL}: Smoothed power-law</li> |
| * </ul> |
| * </li> |
| * <li>{@link Lambda}: λ<sub>w</sub> parameter of the |
| * probability distribution |
| * <ul> |
| * <li>{@link LambdaDF}: <code>N<sub>w</sub>/N</code> or average |
| * number of documents where w occurs</li> |
| * <li>{@link LambdaTTF}: <code>F<sub>w</sub>/N</code> or |
| * average number of occurrences of w in the collection</li> |
| * </ul> |
| * </li> |
| * <li>{@link Normalization}: Term frequency normalization |
| * <blockquote>Any supported DFR normalization (listed in |
| * {@link DFRSimilarity})</blockquote> |
| * </li> |
| * </ol> |
| * @see DFRSimilarity |
| * @lucene.experimental |
| */ |
| public class IBSimilarity extends SimilarityBase { |
| /** The probabilistic distribution used to model term occurrence. */ |
| protected final Distribution distribution; |
| /** The <em>lambda (λ<sub>w</sub>)</em> parameter. */ |
| protected final Lambda lambda; |
| /** The term frequency normalization. */ |
| protected final Normalization normalization; |
| |
| /** |
| * Creates IBSimilarity from the three components. |
| * <p> |
| * Note that <code>null</code> values are not allowed: |
| * if you want no normalization, instead pass |
| * {@link NoNormalization}. |
| * @param distribution probabilistic distribution modeling term occurrence |
| * @param lambda distribution's λ<sub>w</sub> parameter |
| * @param normalization term frequency normalization |
| */ |
| public IBSimilarity(Distribution distribution, |
| Lambda lambda, |
| Normalization normalization) { |
| this.distribution = distribution; |
| this.lambda = lambda; |
| this.normalization = normalization; |
| } |
| |
| @Override |
| protected double score(BasicStats stats, double freq, double docLen) { |
| return stats.getBoost() * |
| distribution.score( |
| stats, |
| normalization.tfn(stats, freq, docLen), |
| lambda.lambda(stats)); |
| } |
| |
| @Override |
| protected void explain( |
| List<Explanation> subs, BasicStats stats, double freq, double docLen) { |
| if (stats.getBoost() != 1.0d) { |
| subs.add(Explanation.match((float)stats.getBoost(), "boost, query boost")); |
| } |
| Explanation normExpl = normalization.explain(stats, freq, docLen); |
| Explanation lambdaExpl = lambda.explain(stats); |
| subs.add(normExpl); |
| subs.add(lambdaExpl); |
| subs.add(distribution.explain(stats, normExpl.getValue().floatValue(), lambdaExpl.getValue().floatValue())); |
| } |
| |
| @Override |
| protected Explanation explain( |
| BasicStats stats, Explanation freq, double docLen) { |
| List<Explanation> subs = new ArrayList<>(); |
| explain(subs, stats, freq.getValue().doubleValue(), docLen); |
| |
| return Explanation.match( |
| (float) score(stats, freq.getValue().doubleValue(), docLen), |
| "score(" + getClass().getSimpleName() + ", freq=" + |
| freq.getValue() +"), computed as boost * " + |
| "distribution.score(stats, normalization.tfn(stats, freq," + |
| " docLen), lambda.lambda(stats)) from:", |
| subs); |
| } |
| |
| |
| /** |
| * The name of IB methods follow the pattern |
| * {@code IB <distribution> <lambda><normalization>}. The name of the |
| * distribution is the same as in the original paper; for the names of lambda |
| * parameters, refer to the javadoc of the {@link Lambda} classes. |
| */ |
| @Override |
| public String toString() { |
| return "IB " + distribution.toString() + "-" + lambda.toString() |
| + normalization.toString(); |
| } |
| |
| /** |
| * Returns the distribution |
| */ |
| public Distribution getDistribution() { |
| return distribution; |
| } |
| |
| /** |
| * Returns the distribution's lambda parameter |
| */ |
| public Lambda getLambda() { |
| return lambda; |
| } |
| |
| /** |
| * Returns the term frequency normalization |
| */ |
| public Normalization getNormalization() { |
| return normalization; |
| } |
| } |