lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.similarities;

 import java.util.ArrayList;
 import java.util.List;
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.SmallFloat;

 /**
  * A subclass of {@code Similarity} that provides a simplified API for its descendants. Subclasses
  * are only required to implement the {@link #score} and {@link #toString()} methods. Implementing
  * {@link #explain(List, BasicStats, double, double)} is optional, inasmuch as SimilarityBase
  * already provides a basic explanation of the score and the term frequency. However, implementers
  * of a subclass are encouraged to include as much detail about the scoring method as possible.
  *
  * <p>Note: multi-word queries such as phrase queries are scored in a different way than Lucene's
  * default ranking algorithm: whereas it "fakes" an IDF value for the phrase as a whole (since it
  * does not know it), this class instead scores phrases as a summation of the individual term
  * scores.
  *
  * @lucene.experimental
  */
 public abstract class SimilarityBase extends Similarity {
   /** For {@link #log2(double)}. Precomputed for efficiency reasons. */
   private static final double LOG_2 = Math.log(2);

   /**
    * True if overlap tokens (tokens with a position increment of zero) are discounted from the
    * document's length.
    */
   protected boolean discountOverlaps = true;

   /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
   public SimilarityBase() {}

   /**
    * Determines whether overlap tokens (tokens with a position increment of zero) are ignored when
    * computing the norm. By default this is true, meaning overlap tokens do not count when
    * computing norms.
    *
    * @param v {@code true} to discount overlap tokens, {@code false} to count them.
    * @lucene.experimental
    * @see #computeNorm
    */
   public void setDiscountOverlaps(boolean v) {
     discountOverlaps = v;
   }

   /**
    * Returns true if overlap tokens are discounted from the document's length.
    *
    * @see #setDiscountOverlaps
    */
   public boolean getDiscountOverlaps() {
     return discountOverlaps;
   }

   /**
    * Creates one {@link BasicSimScorer} per entry of {@code termStats}, each backed by its own
    * stats object obtained from {@link #newStats} and populated by {@link #fillBasicStats}. A
    * single scorer is returned as is; any other number of scorers is wrapped in a {@code
    * MultiSimilarity.MultiSimScorer}.
    */
   @Override
   public final SimScorer scorer(
       float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
     SimScorer[] weights = new SimScorer[termStats.length];
     for (int i = 0; i < termStats.length; i++) {
       // Every term gets its own stats instance; only the field name and boost are shared.
       BasicStats stats = newStats(collectionStats.field(), boost);
       fillBasicStats(stats, collectionStats, termStats[i]);
       weights[i] = new BasicSimScorer(stats);
     }
     if (weights.length == 1) {
       return weights[0];
     } else {
       return new MultiSimilarity.MultiSimScorer(weights);
     }
   }

   /**
    * Factory method to return a custom stats object.
    *
    * @param field the name of the field the stats are created for.
    * @param boost the query boost.
    * @return a new {@code BasicStats} for {@code field} and {@code boost}.
    */
   protected BasicStats newStats(String field, double boost) {
     return new BasicStats(field, boost);
   }

   /**
    * Fills all member fields defined in {@code BasicStats} in {@code stats}. Subclasses can override
    * this method to fill additional stats.
    *
    * @param stats the stats object to populate.
    * @param collectionStats the source of the document count and the total token count.
    * @param termStats the source of the document frequency and the total term frequency.
    */
   protected void fillBasicStats(
       BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
     // TODO: validate this for real, somewhere else
     assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
     assert termStats.docFreq() <= collectionStats.sumDocFreq();

     // TODO: add sumDocFreq for field (numberOfFieldPostings)
     stats.setNumberOfDocuments(collectionStats.docCount());
     stats.setNumberOfFieldTokens(collectionStats.sumTotalTermFreq());
     // The cast forces floating-point division so the average is not truncated.
     stats.setAvgFieldLength(
         collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
     stats.setDocFreq(termStats.docFreq());
     stats.setTotalTermFreq(termStats.totalTermFreq());
   }

   /**
    * Computes the score from the given statistics, term frequency and document length.
    *
    * <p>Subclasses must apply their scoring formula in this method.
    *
    * @param stats the corpus level statistics.
    * @param freq the term frequency.
    * @param docLen the document length.
    * @return the score.
    */
   protected abstract double score(BasicStats stats, double freq, double docLen);

   /**
    * Subclasses should implement this method to explain the score. The explanation assembled by
    * {@link #explain(BasicStats, Explanation, double)} already carries the score, the simple name
    * of the class and the term frequency value; subclasses can add additional clauses to {@code
    * subExpls} to explain details of their scoring formulae.
    *
    * <p>The default implementation does nothing.
    *
    * @param subExpls the list of details of the explanation to extend
    * @param stats the corpus level statistics.
    * @param freq the term frequency.
    * @param docLen the document length.
    */
   protected void explain(
       List<Explanation> subExpls, BasicStats stats, double freq, double docLen) {}

   /**
    * Explains the score. The implementation here provides a basic explanation in the format
    * <em>score(name-of-similarity, freq=term-frequency), computed from:</em>, and attaches the
    * score (computed via the {@link #score(BasicStats, double, double)} method) together with the
    * details collected by {@link #explain(List, BasicStats, double, double)}. Subclasses content
    * with this format may add additional details in that method.
    *
    * @param stats the corpus level statistics.
    * @param freq the term frequency and its explanation.
    * @param docLen the document length.
    * @return the explanation.
    */
   protected Explanation explain(BasicStats stats, Explanation freq, double docLen) {
     // Unwrapped once and narrowed to float, mirroring the float frequency that
     // BasicSimScorer#score(float, long) accepts.
     final float termFreq = freq.getValue().floatValue();
     List<Explanation> subs = new ArrayList<>();
     explain(subs, stats, termFreq, docLen);

     return Explanation.match(
         (float) score(stats, termFreq, docLen),
         "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() + "), computed from:",
         subs);
   }

   /**
    * Subclasses must override this method to return the name of the Similarity and preferably the
    * values of parameters (if any) as well.
    */
   @Override
   public abstract String toString();

   // ------------------------------ Norm handling ------------------------------

   /**
    * Cache of decoded bytes: entry {@code i} holds {@code SmallFloat.byte4ToInt((byte) i)}, so a
    * norm byte can be decoded with a single array lookup.
    */
   private static final float[] LENGTH_TABLE = new float[256];

   static {
     for (int i = 0; i < 256; i++) {
       LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
     }
   }

   /**
    * Encodes the document length in the same way as {@link BM25Similarity}.
    *
    * <p>The length is the unique term count when the field is indexed with {@code
    * IndexOptions.DOCS} and the index was created by major version 8 or later. Otherwise it is the
    * field length, minus the overlap count when {@link #discountOverlaps} is set.
    */
   @Override
   public final long computeNorm(FieldInvertState state) {
     final int numTerms;
     if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
       numTerms = state.getUniqueTermCount();
     } else if (discountOverlaps) {
       numTerms = state.getLength() - state.getNumOverlap();
     } else {
       numTerms = state.getLength();
     }
     return SmallFloat.intToByte4(numTerms);
   }

   // ----------------------------- Static methods ------------------------------

   /**
    * Returns the base two logarithm of {@code x}.
    *
    * @param x the value whose logarithm is taken.
    * @return {@code Math.log(x)} divided by the natural logarithm of 2.
    */
   public static double log2(double x) {
     // Put this to a 'util' class if we need more of these.
     return Math.log(x) / LOG_2;
   }

   // --------------------------------- Classes ---------------------------------

   /**
    * Delegates the {@link #score(float, long)} and {@link #explain(Explanation, long)} methods to
    * {@link SimilarityBase#score(BasicStats, double, double)} and {@link
    * SimilarityBase#explain(BasicStats, Explanation, double)}, respectively.
    */
   final class BasicSimScorer extends SimScorer {
     /** The statistics passed on to every delegated call. */
     final BasicStats stats;

     BasicSimScorer(BasicStats stats) {
       this.stats = stats;
     }

     /** Decodes the low byte of {@code norm}, read as unsigned, through {@code LENGTH_TABLE}. */
     double getLengthValue(long norm) {
       return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)];
     }

     @Override
     public float score(float freq, long norm) {
       return (float) SimilarityBase.this.score(stats, freq, getLengthValue(norm));
     }

     @Override
     public Explanation explain(Explanation freq, long norm) {
       return SimilarityBase.this.explain(stats, freq, getLengthValue(norm));
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.similarities;

	import java.util.ArrayList;
	import java.util.List;
	import org.apache.lucene.index.FieldInvertState;
	import org.apache.lucene.index.IndexOptions;
	import org.apache.lucene.search.CollectionStatistics;
	import org.apache.lucene.search.Explanation;
	import org.apache.lucene.search.TermStatistics;
	import org.apache.lucene.util.SmallFloat;

	/**
	 * A subclass of {@code Similarity} that provides a simplified API for its descendants. Subclasses
	 * are only required to implement the {@link #score} and {@link #toString()} methods. Implementing
	 * {@link #explain(List, BasicStats, double, double)} is optional, inasmuch as SimilarityBase
	 * already provides a basic explanation of the score and the term frequency. However, implementers
	 * of a subclass are encouraged to include as much detail about the scoring method as possible.
	 *
	 * <p>Note: multi-word queries such as phrase queries are scored in a different way than Lucene's
	 * default ranking algorithm: whereas it "fakes" an IDF value for the phrase as a whole (since it
	 * does not know it), this class instead scores phrases as a summation of the individual term
	 * scores.
	 *
	 * @lucene.experimental
	 */
	public abstract class SimilarityBase extends Similarity {
	  /** For {@link #log2(double)}. Precomputed for efficiency reasons. */
	  private static final double LOG_2 = Math.log(2);

	  /**
	   * True if overlap tokens (tokens with a position increment of zero) are discounted from the
	   * document's length.
	   */
	  protected boolean discountOverlaps = true;

	  /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
	  public SimilarityBase() {}

	  /**
	   * Determines whether overlap tokens (tokens with a position increment of zero) are ignored when
	   * computing the norm. By default this is true, meaning overlap tokens do not count when
	   * computing norms.
	   *
	   * @param v {@code true} to discount overlap tokens, {@code false} to count them.
	   * @lucene.experimental
	   * @see #computeNorm
	   */
	  public void setDiscountOverlaps(boolean v) {
	    discountOverlaps = v;
	  }

	  /**
	   * Returns true if overlap tokens are discounted from the document's length.
	   *
	   * @see #setDiscountOverlaps
	   */
	  public boolean getDiscountOverlaps() {
	    return discountOverlaps;
	  }

	  /**
	   * Creates one {@link BasicSimScorer} per entry of {@code termStats}, each backed by its own
	   * stats object obtained from {@link #newStats} and populated by {@link #fillBasicStats}. A
	   * single scorer is returned as is; any other number of scorers is wrapped in a {@code
	   * MultiSimilarity.MultiSimScorer}.
	   */
	  @Override
	  public final SimScorer scorer(
	      float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
	    SimScorer[] weights = new SimScorer[termStats.length];
	    for (int i = 0; i < termStats.length; i++) {
	      // Every term gets its own stats instance; only the field name and boost are shared.
	      BasicStats stats = newStats(collectionStats.field(), boost);
	      fillBasicStats(stats, collectionStats, termStats[i]);
	      weights[i] = new BasicSimScorer(stats);
	    }
	    if (weights.length == 1) {
	      return weights[0];
	    } else {
	      return new MultiSimilarity.MultiSimScorer(weights);
	    }
	  }

	  /**
	   * Factory method to return a custom stats object.
	   *
	   * @param field the name of the field the stats are created for.
	   * @param boost the query boost.
	   * @return a new {@code BasicStats} for {@code field} and {@code boost}.
	   */
	  protected BasicStats newStats(String field, double boost) {
	    return new BasicStats(field, boost);
	  }

	  /**
	   * Fills all member fields defined in {@code BasicStats} in {@code stats}. Subclasses can override
	   * this method to fill additional stats.
	   *
	   * @param stats the stats object to populate.
	   * @param collectionStats the source of the document count and the total token count.
	   * @param termStats the source of the document frequency and the total term frequency.
	   */
	  protected void fillBasicStats(
	      BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
	    // TODO: validate this for real, somewhere else
	    assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
	    assert termStats.docFreq() <= collectionStats.sumDocFreq();

	    // TODO: add sumDocFreq for field (numberOfFieldPostings)
	    stats.setNumberOfDocuments(collectionStats.docCount());
	    stats.setNumberOfFieldTokens(collectionStats.sumTotalTermFreq());
	    // The cast forces floating-point division so the average is not truncated.
	    stats.setAvgFieldLength(
	        collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
	    stats.setDocFreq(termStats.docFreq());
	    stats.setTotalTermFreq(termStats.totalTermFreq());
	  }

	  /**
	   * Computes the score from the given statistics, term frequency and document length.
	   *
	   * <p>Subclasses must apply their scoring formula in this method.
	   *
	   * @param stats the corpus level statistics.
	   * @param freq the term frequency.
	   * @param docLen the document length.
	   * @return the score.
	   */
	  protected abstract double score(BasicStats stats, double freq, double docLen);

	  /**
	   * Subclasses should implement this method to explain the score. The explanation assembled by
	   * {@link #explain(BasicStats, Explanation, double)} already carries the score, the simple name
	   * of the class and the term frequency value; subclasses can add additional clauses to {@code
	   * subExpls} to explain details of their scoring formulae.
	   *
	   * <p>The default implementation does nothing.
	   *
	   * @param subExpls the list of details of the explanation to extend
	   * @param stats the corpus level statistics.
	   * @param freq the term frequency.
	   * @param docLen the document length.
	   */
	  protected void explain(
	      List<Explanation> subExpls, BasicStats stats, double freq, double docLen) {}

	  /**
	   * Explains the score. The implementation here provides a basic explanation in the format
	   * <em>score(name-of-similarity, freq=term-frequency), computed from:</em>, and attaches the
	   * score (computed via the {@link #score(BasicStats, double, double)} method) together with the
	   * details collected by {@link #explain(List, BasicStats, double, double)}. Subclasses content
	   * with this format may add additional details in that method.
	   *
	   * @param stats the corpus level statistics.
	   * @param freq the term frequency and its explanation.
	   * @param docLen the document length.
	   * @return the explanation.
	   */
	  protected Explanation explain(BasicStats stats, Explanation freq, double docLen) {
	    // Unwrapped once and narrowed to float, mirroring the float frequency that
	    // BasicSimScorer#score(float, long) accepts.
	    final float termFreq = freq.getValue().floatValue();
	    List<Explanation> subs = new ArrayList<>();
	    explain(subs, stats, termFreq, docLen);

	    return Explanation.match(
	        (float) score(stats, termFreq, docLen),
	        "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() + "), computed from:",
	        subs);
	  }

	  /**
	   * Subclasses must override this method to return the name of the Similarity and preferably the
	   * values of parameters (if any) as well.
	   */
	  @Override
	  public abstract String toString();

	  // ------------------------------ Norm handling ------------------------------

	  /**
	   * Cache of decoded bytes: entry {@code i} holds {@code SmallFloat.byte4ToInt((byte) i)}, so a
	   * norm byte can be decoded with a single array lookup.
	   */
	  private static final float[] LENGTH_TABLE = new float[256];

	  static {
	    for (int i = 0; i < 256; i++) {
	      LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
	    }
	  }

	  /**
	   * Encodes the document length in the same way as {@link BM25Similarity}.
	   *
	   * <p>The length is the unique term count when the field is indexed with {@code
	   * IndexOptions.DOCS} and the index was created by major version 8 or later. Otherwise it is the
	   * field length, minus the overlap count when {@link #discountOverlaps} is set.
	   */
	  @Override
	  public final long computeNorm(FieldInvertState state) {
	    final int numTerms;
	    if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
	      numTerms = state.getUniqueTermCount();
	    } else if (discountOverlaps) {
	      numTerms = state.getLength() - state.getNumOverlap();
	    } else {
	      numTerms = state.getLength();
	    }
	    return SmallFloat.intToByte4(numTerms);
	  }

	  // ----------------------------- Static methods ------------------------------

	  /**
	   * Returns the base two logarithm of {@code x}.
	   *
	   * @param x the value whose logarithm is taken.
	   * @return {@code Math.log(x)} divided by the natural logarithm of 2.
	   */
	  public static double log2(double x) {
	    // Put this to a 'util' class if we need more of these.
	    return Math.log(x) / LOG_2;
	  }

	  // --------------------------------- Classes ---------------------------------

	  /**
	   * Delegates the {@link #score(float, long)} and {@link #explain(Explanation, long)} methods to
	   * {@link SimilarityBase#score(BasicStats, double, double)} and {@link
	   * SimilarityBase#explain(BasicStats, Explanation, double)}, respectively.
	   */
	  final class BasicSimScorer extends SimScorer {
	    /** The statistics passed on to every delegated call. */
	    final BasicStats stats;

	    BasicSimScorer(BasicStats stats) {
	      this.stats = stats;
	    }

	    /** Decodes the low byte of {@code norm}, read as unsigned, through {@code LENGTH_TABLE}. */
	    double getLengthValue(long norm) {
	      return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)];
	    }

	    @Override
	    public float score(float freq, long norm) {
	      return (float) SimilarityBase.this.score(stats, freq, getLengthValue(norm));
	    }

	    @Override
	    public Explanation explain(Explanation freq, long norm) {
	      return SimilarityBase.this.explain(stats, freq, getLengthValue(norm));
	    }
	  }
	}