lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.similarities;


 import java.util.ArrayList;
 import java.util.List;

 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SmallFloat;

 /**
  * BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker,
  * Susan Jones, Micheline Hancock-Beaulieu, and Mike Gatford. Okapi at TREC-3.
  * In Proceedings of the Third <b>T</b>ext <b>RE</b>trieval <b>C</b>onference (TREC 1994).
  * Gaithersburg, USA, November 1994.
  */
 public class BM25Similarity extends Similarity {
   private final float k1;
   private final float b;

   /**
    * BM25 with the supplied parameter values.
    * @param k1 Controls non-linear term frequency normalization (saturation).
    * @param b Controls to what degree document length normalizes tf values.
    * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is
    *         not within the range {@code [0..1]}
    */
   public BM25Similarity(float k1, float b) {
     if (Float.isFinite(k1) == false || k1 < 0) {
       throw new IllegalArgumentException("illegal k1 value: " + k1 + ", must be a non-negative finite value");
     }
     if (Float.isNaN(b) || b < 0 || b > 1) {
       throw new IllegalArgumentException("illegal b value: " + b + ", must be between 0 and 1");
     }
     this.k1 = k1;
     this.b  = b;
   }

   /** BM25 with these default values:
    * <ul>
    *   <li>{@code k1 = 1.2}</li>
    *   <li>{@code b = 0.75}</li>
    * </ul>
    */
   public BM25Similarity() {
     this(1.2f, 0.75f);
   }

   /** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
   protected float idf(long docFreq, long docCount) {
     return (float) Math.log(1 + (docCount - docFreq + 0.5D)/(docFreq + 0.5D));
   }

   /** The default implementation returns <code>1</code> */
   protected float scorePayload(int doc, int start, int end, BytesRef payload) {
     return 1;
   }

   /** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
   protected float avgFieldLength(CollectionStatistics collectionStats) {
     return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
   }

   /**
    * True if overlap tokens (tokens with a position of increment of zero) are
    * discounted from the document's length.
    */
   protected boolean discountOverlaps = true;

   /** Sets whether overlap tokens (Tokens with 0 position increment) are
    *  ignored when computing norm.  By default this is true, meaning overlap
    *  tokens do not count when computing norms. */
   public void setDiscountOverlaps(boolean v) {
     discountOverlaps = v;
   }

   /**
    * Returns true if overlap tokens are discounted from the document's length.
    * @see #setDiscountOverlaps
    */
   public boolean getDiscountOverlaps() {
     return discountOverlaps;
   }

   /** Cache of decoded bytes. */
   private static final float[] LENGTH_TABLE = new float[256];

   static {
     for (int i = 0; i < 256; i++) {
       LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
     }
   }


   @Override
   public final long computeNorm(FieldInvertState state) {
     final int numTerms;
     if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
       numTerms = state.getUniqueTermCount();
     } else if (discountOverlaps) {
       numTerms = state.getLength() - state.getNumOverlap();
     } else {
       numTerms = state.getLength();
     }
     return SmallFloat.intToByte4(numTerms);
   }

   /**
    * Computes a score factor for a simple term and returns an explanation
    * for that score factor.
    *
    * <p>
    * The default implementation uses:
    *
    * <pre class="prettyprint">
    * idf(docFreq, docCount);
    * </pre>
    *
    * Note that {@link CollectionStatistics#docCount()} is used instead of
    * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
    * {@link TermStatistics#docFreq()} is used, and when the latter
    * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
    * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
    *
    * @param collectionStats collection-level statistics
    * @param termStats term-level statistics for the term
    * @return an Explain object that includes both an idf score factor
              and an explanation for the term.
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
     final long df = termStats.docFreq();
     final long docCount = collectionStats.docCount();
     final float idf = idf(df, docCount);
     return Explanation.match(idf, "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
         Explanation.match(df, "n, number of documents containing term"),
         Explanation.match(docCount, "N, total number of documents with field"));
   }

   /**
    * Computes a score factor for a phrase.
    *
    * <p>
    * The default implementation sums the idf factor for
    * each term in the phrase.
    *
    * @param collectionStats collection-level statistics
    * @param termStats term-level statistics for the terms in the phrase
    * @return an Explain object that includes both an idf
    *         score factor for the phrase and an explanation
    *         for each term.
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
     double idf = 0d; // sum into a double before casting into a float
     List<Explanation> details = new ArrayList<>();
     for (final TermStatistics stat : termStats ) {
       Explanation idfExplain = idfExplain(collectionStats, stat);
       details.add(idfExplain);
       idf += idfExplain.getValue().floatValue();
     }
     return Explanation.match((float) idf, "idf, sum of:", details);
   }

   @Override
   public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
     Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
     float avgdl = avgFieldLength(collectionStats);

     float[] cache = new float[256];
     for (int i = 0; i < cache.length; i++) {
       cache[i] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl));
     }
     return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
   }

   /** Collection statistics for the BM25 model. */
   private static class BM25Scorer extends SimScorer {
     /** query boost */
     private final float boost;
     /** k1 value for scale factor */
     private final float k1;
     /** b value for length normalization impact */
     private final float b;
     /** BM25's idf */
     private final Explanation idf;
     /** The average document length. */
     private final float avgdl;
     /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
     private final float[] cache;
     /** weight (idf * boost) */
     private final float weight;

     BM25Scorer(float boost, float k1, float b, Explanation idf, float avgdl, float[] cache) {
       this.boost = boost;
       this.idf = idf;
       this.avgdl = avgdl;
       this.k1 = k1;
       this.b = b;
       this.cache = cache;
       this.weight = boost * idf.getValue().floatValue();
     }

     @Override
     public float score(float freq, long encodedNorm) {
       // In order to guarantee monotonicity with both freq and norm without
       // promoting to doubles, we rewrite freq / (freq + norm) to
       // 1 - 1 / (1 + freq * 1/norm).
       // freq * 1/norm is guaranteed to be monotonic for both freq and norm due
       // to the fact that multiplication and division round to the nearest
       // float. And then monotonicity is preserved through composition via
       // x -> 1 + x and x -> 1 - 1/x.
       // Finally we expand weight * (1 - 1 / (1 + freq * 1/norm)) to
       // weight - weight / (1 + freq * 1/norm), which runs slightly faster.
       float normInverse = cache[((byte) encodedNorm) & 0xFF];
       return weight - weight / (1f + freq * normInverse);
     }

     @Override
     public Explanation explain(Explanation freq, long encodedNorm) {
       List<Explanation> subs = new ArrayList<>(explainConstantFactors());
       Explanation tfExpl = explainTF(freq, encodedNorm);
       subs.add(tfExpl);
       float normInverse = cache[((byte) encodedNorm) & 0xFF];
       // not using "product of" since the rewrite that we do in score()
       // introduces a small rounding error that CheckHits complains about
       return Explanation.match(weight - weight / (1f + freq.getValue().floatValue() * normInverse),
           "score(freq="+freq.getValue()+"), computed as boost * idf * tf from:", subs);
     }

     private Explanation explainTF(Explanation freq, long norm) {
       List<Explanation> subs = new ArrayList<>();
       subs.add(freq);
       subs.add(Explanation.match(k1, "k1, term saturation parameter"));
       float doclen = LENGTH_TABLE[((byte) norm) & 0xff];
       subs.add(Explanation.match(b, "b, length normalization parameter"));
       if ((norm & 0xFF) > 39) {
         subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
       } else {
         subs.add(Explanation.match(doclen, "dl, length of field"));
       }
       subs.add(Explanation.match(avgdl, "avgdl, average length of field"));
       float normInverse = 1f / (k1 * ((1 - b) + b * doclen / avgdl));
       return Explanation.match(
           1f - 1f / (1 + freq.getValue().floatValue() * normInverse),
           "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
     }

     private List<Explanation> explainConstantFactors() {
       List<Explanation> subs = new ArrayList<>();
       // query boost
       if (boost != 1.0f) {
         subs.add(Explanation.match(boost, "boost"));
       }
       // idf
       subs.add(idf);
       return subs;
     }
   }

   @Override
   public String toString() {
     return "BM25(k1=" + k1 + ",b=" + b + ")";
   }

   /**
    * Returns the <code>k1</code> parameter
    * @see #BM25Similarity(float, float)
    */
   public final float getK1() {
     return k1;
   }

   /**
    * Returns the <code>b</code> parameter
    * @see #BM25Similarity(float, float)
    */
   public final float getB() {
     return b;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.similarities;


	import java.util.ArrayList;
	import java.util.List;

	import org.apache.lucene.index.FieldInvertState;
	import org.apache.lucene.index.IndexOptions;
	import org.apache.lucene.search.CollectionStatistics;
	import org.apache.lucene.search.Explanation;
	import org.apache.lucene.search.TermStatistics;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.SmallFloat;

	/**
	* BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker,
	* Susan Jones, Micheline Hancock-Beaulieu, and Mike Gatford. Okapi at TREC-3.
	* In Proceedings of the Third <b>T</b>ext <b>RE</b>trieval <b>C</b>onference (TREC 1994).
	* Gaithersburg, USA, November 1994.
	*/
	public class BM25Similarity extends Similarity {
	private final float k1;
	private final float b;

	/**
	* BM25 with the supplied parameter values.
	* @param k1 Controls non-linear term frequency normalization (saturation).
	* @param b Controls to what degree document length normalizes tf values.
	* @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is
	* not within the range {@code [0..1]}
	*/
	public BM25Similarity(float k1, float b) {
	if (Float.isFinite(k1) == false \|\| k1 < 0) {
	throw new IllegalArgumentException("illegal k1 value: " + k1 + ", must be a non-negative finite value");
	}
	if (Float.isNaN(b) \|\| b < 0 \|\| b > 1) {
	throw new IllegalArgumentException("illegal b value: " + b + ", must be between 0 and 1");
	}
	this.k1 = k1;
	this.b = b;
	}

	/** BM25 with these default values:
	* <ul>
	* <li>{@code k1 = 1.2}</li>
	* <li>{@code b = 0.75}</li>
	* </ul>
	*/
	public BM25Similarity() {
	this(1.2f, 0.75f);
	}

	/** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
	protected float idf(long docFreq, long docCount) {
	return (float) Math.log(1 + (docCount - docFreq + 0.5D)/(docFreq + 0.5D));
	}

	/** The default implementation returns <code>1</code> */
	protected float scorePayload(int doc, int start, int end, BytesRef payload) {
	return 1;
	}

	/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
	protected float avgFieldLength(CollectionStatistics collectionStats) {
	return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
	}

	/**
	* True if overlap tokens (tokens with a position of increment of zero) are
	* discounted from the document's length.
	*/
	protected boolean discountOverlaps = true;

	/** Sets whether overlap tokens (Tokens with 0 position increment) are
	* ignored when computing norm. By default this is true, meaning overlap
	* tokens do not count when computing norms. */
	public void setDiscountOverlaps(boolean v) {
	discountOverlaps = v;
	}

	/**
	* Returns true if overlap tokens are discounted from the document's length.
	* @see #setDiscountOverlaps
	*/
	public boolean getDiscountOverlaps() {
	return discountOverlaps;
	}

	/** Cache of decoded bytes. */
	private static final float[] LENGTH_TABLE = new float[256];

	static {
	for (int i = 0; i < 256; i++) {
	LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
	}
	}


	@Override
	public final long computeNorm(FieldInvertState state) {
	final int numTerms;
	if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
	numTerms = state.getUniqueTermCount();
	} else if (discountOverlaps) {
	numTerms = state.getLength() - state.getNumOverlap();
	} else {
	numTerms = state.getLength();
	}
	return SmallFloat.intToByte4(numTerms);
	}

	/**
	* Computes a score factor for a simple term and returns an explanation
	* for that score factor.
	*
	* <p>
	* The default implementation uses:
	*
	* <pre class="prettyprint">
	* idf(docFreq, docCount);
	* </pre>
	*
	* Note that {@link CollectionStatistics#docCount()} is used instead of
	* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
	* {@link TermStatistics#docFreq()} is used, and when the latter
	* is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
	* In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
	*
	* @param collectionStats collection-level statistics
	* @param termStats term-level statistics for the term
	* @return an Explain object that includes both an idf score factor
	and an explanation for the term.
	*/
	public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
	final long df = termStats.docFreq();
	final long docCount = collectionStats.docCount();
	final float idf = idf(df, docCount);
	return Explanation.match(idf, "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
	Explanation.match(df, "n, number of documents containing term"),
	Explanation.match(docCount, "N, total number of documents with field"));
	}

	/**
	* Computes a score factor for a phrase.
	*
	* <p>
	* The default implementation sums the idf factor for
	* each term in the phrase.
	*
	* @param collectionStats collection-level statistics
	* @param termStats term-level statistics for the terms in the phrase
	* @return an Explain object that includes both an idf
	* score factor for the phrase and an explanation
	* for each term.
	*/
	public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
	double idf = 0d; // sum into a double before casting into a float
	List<Explanation> details = new ArrayList<>();
	for (final TermStatistics stat : termStats ) {
	Explanation idfExplain = idfExplain(collectionStats, stat);
	details.add(idfExplain);
	idf += idfExplain.getValue().floatValue();
	}
	return Explanation.match((float) idf, "idf, sum of:", details);
	}

	@Override
	public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
	Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
	float avgdl = avgFieldLength(collectionStats);

	float[] cache = new float[256];
	for (int i = 0; i < cache.length; i++) {
	cache[i] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl));
	}
	return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
	}

	/** Collection statistics for the BM25 model. */
	private static class BM25Scorer extends SimScorer {
	/** query boost */
	private final float boost;
	/** k1 value for scale factor */
	private final float k1;
	/** b value for length normalization impact */
	private final float b;
	/** BM25's idf */
	private final Explanation idf;
	/** The average document length. */
	private final float avgdl;
	/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
	private final float[] cache;
	/** weight (idf * boost) */
	private final float weight;

	BM25Scorer(float boost, float k1, float b, Explanation idf, float avgdl, float[] cache) {
	this.boost = boost;
	this.idf = idf;
	this.avgdl = avgdl;
	this.k1 = k1;
	this.b = b;
	this.cache = cache;
	this.weight = boost * idf.getValue().floatValue();
	}

	@Override
	public float score(float freq, long encodedNorm) {
	// In order to guarantee monotonicity with both freq and norm without
	// promoting to doubles, we rewrite freq / (freq + norm) to
	// 1 - 1 / (1 + freq * 1/norm).
	// freq * 1/norm is guaranteed to be monotonic for both freq and norm due
	// to the fact that multiplication and division round to the nearest
	// float. And then monotonicity is preserved through composition via
	// x -> 1 + x and x -> 1 - 1/x.
	// Finally we expand weight * (1 - 1 / (1 + freq * 1/norm)) to
	// weight - weight / (1 + freq * 1/norm), which runs slightly faster.
	float normInverse = cache[((byte) encodedNorm) & 0xFF];
	return weight - weight / (1f + freq * normInverse);
	}

	@Override
	public Explanation explain(Explanation freq, long encodedNorm) {
	List<Explanation> subs = new ArrayList<>(explainConstantFactors());
	Explanation tfExpl = explainTF(freq, encodedNorm);
	subs.add(tfExpl);
	float normInverse = cache[((byte) encodedNorm) & 0xFF];
	// not using "product of" since the rewrite that we do in score()
	// introduces a small rounding error that CheckHits complains about
	return Explanation.match(weight - weight / (1f + freq.getValue().floatValue() * normInverse),
	"score(freq="+freq.getValue()+"), computed as boost * idf * tf from:", subs);
	}

	private Explanation explainTF(Explanation freq, long norm) {
	List<Explanation> subs = new ArrayList<>();
	subs.add(freq);
	subs.add(Explanation.match(k1, "k1, term saturation parameter"));
	float doclen = LENGTH_TABLE[((byte) norm) & 0xff];
	subs.add(Explanation.match(b, "b, length normalization parameter"));
	if ((norm & 0xFF) > 39) {
	subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
	} else {
	subs.add(Explanation.match(doclen, "dl, length of field"));
	}
	subs.add(Explanation.match(avgdl, "avgdl, average length of field"));
	float normInverse = 1f / (k1 * ((1 - b) + b * doclen / avgdl));
	return Explanation.match(
	1f - 1f / (1 + freq.getValue().floatValue() * normInverse),
	"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
	}

	private List<Explanation> explainConstantFactors() {
	List<Explanation> subs = new ArrayList<>();
	// query boost
	if (boost != 1.0f) {
	subs.add(Explanation.match(boost, "boost"));
	}
	// idf
	subs.add(idf);
	return subs;
	}
	}

	@Override
	public String toString() {
	return "BM25(k1=" + k1 + ",b=" + b + ")";
	}

	/**
	* Returns the <code>k1</code> parameter
	* @see #BM25Similarity(float, float)
	*/
	public final float getK1() {
	return k1;
	}

	/**
	* Returns the <code>b</code> parameter
	* @see #BM25Similarity(float, float)
	*/
	public final float getB() {
	return b;
	}
	}