lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.similarities;

 import java.util.Collections;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.SmallFloat;

 /**
  * Similarity defines the components of Lucene scoring.
  *
  * <p>Expert: Scoring API.
  *
  * <p>This is a low-level API, you should only extend this API if you want to implement an
  * information retrieval <i>model</i>. If you are instead looking for a convenient way to alter
  * Lucene's scoring, consider just tweaking the default implementation: {@link BM25Similarity} or
  * extend {@link SimilarityBase}, which makes it easy to compute a score from index statistics.
  *
  * <p>Similarity determines how Lucene weights terms, and Lucene interacts with this class at both
  * <a href="#indextime">index-time</a> and <a href="#querytime">query-time</a>.
  *
  * <p><a id="indextime">Indexing Time</a> At indexing time, the indexer calls {@link
  * #computeNorm(FieldInvertState)}, allowing the Similarity implementation to set a per-document
  * value for the field that will be later accessible via {@link
  * org.apache.lucene.index.LeafReader#getNormValues(String)}. Lucene makes no assumption about what
  * is in this norm, but it is most useful for encoding length normalization information.
  *
  * <p>Implementations should carefully consider how the normalization is encoded: while Lucene's
  * {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a
  * single byte, this might not be suitable for all purposes.
  *
  * <p>Many formulas require the use of average document length, which can be computed via a
  * combination of {@link CollectionStatistics#sumTotalTermFreq()} and {@link
  * CollectionStatistics#docCount()}.
  *
  * <p>Additional scoring factors can be stored in named {@link NumericDocValuesField}s and accessed
  * at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
  * However this should not be done in the {@link Similarity} but externally, for instance by using
  * <code>FunctionScoreQuery</code>.
  *
  * <p>Finally, using index-time boosts (either via folding into the normalization byte or via
  * DocValues), is an inefficient way to boost the scores of different fields if the boost will be
  * the same for every document, instead the Similarity can simply take a constant boost parameter
  * <i>C</i>, and {@link PerFieldSimilarityWrapper} can return different instances with different
  * boosts depending upon field name.
  *
  * <p><a id="querytime">Query time</a> At query-time, Queries interact with the Similarity via these
  * steps:
  *
  * <ol>
  *   <li>The {@link #scorer(float, CollectionStatistics, TermStatistics...)} method is called a
  *       single time, allowing the implementation to compute any statistics (such as IDF, average
  *       document length, etc) across <i>the entire collection</i>. The {@link TermStatistics} and
  *       {@link CollectionStatistics} passed in already contain all of the raw statistics involved,
  *       so a Similarity can freely use any combination of statistics without causing any additional
  *       I/O. Lucene makes no assumption about what is stored in the returned {@link
  *       Similarity.SimScorer} object.
  *   <li>Then {@link SimScorer#score(float, long)} is called for every matching document to compute
  *       its score.
  * </ol>
  *
  * <p><a id="explaintime">Explanations</a> When {@link
  * IndexSearcher#explain(org.apache.lucene.search.Query, int)} is called, queries consult the
  * Similarity's {@link SimScorer} for an explanation of how it computed its score. The query passes
  * in an explanation of how the frequency was computed, along with the document's encoded norm.
  *
  * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity)
  * @see IndexSearcher#setSimilarity(Similarity)
  * @lucene.experimental
  */
 public abstract class Similarity {
   /** Sole constructor; subclass constructors invoke it, usually implicitly. */
   // Declared explicitly so that the constructor carries non-empty javadoc
   protected Similarity() {}

   /**
    * Derives the normalization value of a field from the state accumulated while its terms were
    * processed (see {@link FieldInvertState}).
    *
    * <p>Since a match in a longer field is less precise, implementations of this method usually
    * set larger values when <code>state.getLength()</code> is small, and smaller values when
    * <code>state.getLength()</code> is large.
    *
    * <p>For a fixed term-document frequency, scores must not grow as the unsigned norm grows: given
    * two encoded norms {@code n1} and {@code n2} with {@code Long.compareUnsigned(n1, n2) > 0},
    * {@code SimScorer.score(freq, n1) <= SimScorer.score(freq, n2)} must hold for every legal
    * {@code freq}.
    *
    * <p>{@code 0} is not a legal norm, which makes {@code 1} the norm yielding the highest scores.
    *
    * @lucene.experimental
    * @param state current processing state for this field
    * @return computed norm value
    */
   public abstract long computeNorm(FieldInvertState state);

   /**
    * Computes whatever collection-level weight (e.g. IDF, average document length, etc) is required
    * to score a query.
    *
    * @param boost a multiplicative factor to apply to the produced scores
    * @param collectionStats collection-level statistics, such as the number of tokens in the
    *     collection.
    * @param termStats term-level statistics, such as the document frequency of a term across the
    *     collection.
    * @return {@link SimScorer} holding the information this Similarity needs to score a query.
    */
   public abstract SimScorer scorer(
       float boost, CollectionStatistics collectionStats, TermStatistics... termStats);

   /**
    * Holds the weight of a query across the indexed collection and scores individual documents
    * with it. This abstract base declares no state of its own; descendants of {@code Similarity}
    * should subclass {@code SimScorer} and define whichever statistics they require. Examples
    * include idf, average field length, etc.
    */
   public abstract static class SimScorer {

     /** Sole constructor, meant to be invoked by subclass constructors. */
     protected SimScorer() {}

     /**
      * Scores one document. {@code freq} is the sloppy document-term frequency; it must be finite
      * and positive. {@code norm} is the encoded normalization factor that {@link
      * Similarity#computeNorm(FieldInvertState)} produced at index time, or {@code 1} when norms are
      * disabled. {@code norm} is never {@code 0}.
      *
      * <p>The score must be non-decreasing in {@code freq}: if {@code freq1 > freq2}, then {@code
      * score(freq1, norm) >= score(freq2, norm)} for every {@code norm} that {@link
      * Similarity#computeNorm(FieldInvertState)} may produce.
      *
      * <p>The score must be non-increasing in the unsigned {@code norm}: if {@code
      * Long.compareUnsigned(norm1, norm2) > 0}, then {@code score(freq, norm1) <= score(freq,
      * norm2)} for every legal {@code freq}.
      *
      * <p>It follows that {@code score(Float.MAX_VALUE, 1)} is an upper bound on every score this
      * scorer can produce.
      *
      * @param freq sloppy term frequency, must be finite and positive
      * @param norm encoded normalization factor or {@code 1} if norms are disabled
      * @return document's score
      */
     public abstract float score(float freq, long norm);

     /**
      * Explains the score of a single document. The score is recomputed through {@link
      * #score(float, long)} from the value carried by {@code freq}, and {@code freq} itself is
      * attached as the only detail of the returned explanation.
      *
      * @param freq Explanation of how the sloppy term frequency was computed
      * @param norm encoded normalization factor, as returned by {@link Similarity#computeNorm}, or
      *     {@code 1} if norms are disabled
      * @return explanation of the document's score
      */
     public Explanation explain(Explanation freq, long norm) {
       // Score first, then build the description, mirroring argument evaluation order.
       final float docScore = score(freq.getValue().floatValue(), norm);
       final String description = "score(freq=" + freq.getValue() + "), with freq of:";
       return Explanation.match(docScore, description, Collections.singleton(freq));
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.similarities;

	import java.util.Collections;
	import org.apache.lucene.document.NumericDocValuesField;
	import org.apache.lucene.index.FieldInvertState;
	import org.apache.lucene.search.CollectionStatistics;
	import org.apache.lucene.search.Explanation;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.TermStatistics;
	import org.apache.lucene.util.SmallFloat;

	/**
	* Similarity defines the components of Lucene scoring.
	*
	* <p>Expert: Scoring API.
	*
	* <p>This is a low-level API, you should only extend this API if you want to implement an
	* information retrieval <i>model</i>. If you are instead looking for a convenient way to alter
	* Lucene's scoring, consider just tweaking the default implementation: {@link BM25Similarity} or
	* extend {@link SimilarityBase}, which makes it easy to compute a score from index statistics.
	*
	* <p>Similarity determines how Lucene weights terms, and Lucene interacts with this class at both
	* <a href="#indextime">index-time</a> and <a href="#querytime">query-time</a>.
	*
	* <p><a id="indextime">Indexing Time</a> At indexing time, the indexer calls {@link
	* #computeNorm(FieldInvertState)}, allowing the Similarity implementation to set a per-document
	* value for the field that will be later accessible via {@link
	* org.apache.lucene.index.LeafReader#getNormValues(String)}. Lucene makes no assumption about what
	* is in this norm, but it is most useful for encoding length normalization information.
	*
	* <p>Implementations should carefully consider how the normalization is encoded: while Lucene's
	* {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a
	* single byte, this might not be suitable for all purposes.
	*
	* <p>Many formulas require the use of average document length, which can be computed via a
	* combination of {@link CollectionStatistics#sumTotalTermFreq()} and {@link
	* CollectionStatistics#docCount()}.
	*
	* <p>Additional scoring factors can be stored in named {@link NumericDocValuesField}s and accessed
	* at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
	* However this should not be done in the {@link Similarity} but externally, for instance by using
	* <code>FunctionScoreQuery</code>.
	*
	* <p>Finally, using index-time boosts (either via folding into the normalization byte or via
	* DocValues), is an inefficient way to boost the scores of different fields if the boost will be
	* the same for every document, instead the Similarity can simply take a constant boost parameter
	* <i>C</i>, and {@link PerFieldSimilarityWrapper} can return different instances with different
	* boosts depending upon field name.
	*
	* <p><a id="querytime">Query time</a> At query-time, Queries interact with the Similarity via these
	* steps:
	*
	* <ol>
	* <li>The {@link #scorer(float, CollectionStatistics, TermStatistics...)} method is called a
	* single time, allowing the implementation to compute any statistics (such as IDF, average
	* document length, etc) across <i>the entire collection</i>. The {@link TermStatistics} and
	* {@link CollectionStatistics} passed in already contain all of the raw statistics involved,
	* so a Similarity can freely use any combination of statistics without causing any additional
	* I/O. Lucene makes no assumption about what is stored in the returned {@link
	* Similarity.SimScorer} object.
	* <li>Then {@link SimScorer#score(float, long)} is called for every matching document to compute
	* its score.
	* </ol>
	*
	* <p><a id="explaintime">Explanations</a> When {@link
	* IndexSearcher#explain(org.apache.lucene.search.Query, int)} is called, queries consult the
	* Similarity's {@link SimScorer} for an explanation of how it computed its score. The query passes
	* in an explanation of how the frequency was computed, along with the document's encoded norm.
	*
	* @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity)
	* @see IndexSearcher#setSimilarity(Similarity)
	* @lucene.experimental
	*/
	public abstract class Similarity {
		/** Sole constructor; subclass constructors invoke it, usually implicitly. */
		// Declared explicitly so that the constructor carries non-empty javadoc
		protected Similarity() {}

		/**
		 * Derives the normalization value of a field from the state accumulated while its terms
		 * were processed (see {@link FieldInvertState}).
		 *
		 * <p>Since a match in a longer field is less precise, implementations of this method
		 * usually set larger values when <code>state.getLength()</code> is small, and smaller
		 * values when <code>state.getLength()</code> is large.
		 *
		 * <p>For a fixed term-document frequency, scores must not grow as the unsigned norm grows:
		 * given two encoded norms {@code n1} and {@code n2} with {@code Long.compareUnsigned(n1,
		 * n2) > 0}, {@code SimScorer.score(freq, n1) <= SimScorer.score(freq, n2)} must hold for
		 * every legal {@code freq}.
		 *
		 * <p>{@code 0} is not a legal norm, which makes {@code 1} the norm yielding the highest
		 * scores.
		 *
		 * @lucene.experimental
		 * @param state current processing state for this field
		 * @return computed norm value
		 */
		public abstract long computeNorm(FieldInvertState state);

		/**
		 * Computes whatever collection-level weight (e.g. IDF, average document length, etc) is
		 * required to score a query.
		 *
		 * @param boost a multiplicative factor to apply to the produced scores
		 * @param collectionStats collection-level statistics, such as the number of tokens in the
		 *     collection.
		 * @param termStats term-level statistics, such as the document frequency of a term across
		 *     the collection.
		 * @return {@link SimScorer} holding the information this Similarity needs to score a query.
		 */
		public abstract SimScorer scorer(
				float boost, CollectionStatistics collectionStats, TermStatistics... termStats);

		/**
		 * Holds the weight of a query across the indexed collection and scores individual
		 * documents with it. This abstract base declares no state of its own; descendants of
		 * {@code Similarity} should subclass {@code SimScorer} and define whichever statistics
		 * they require. Examples include idf, average field length, etc.
		 */
		public abstract static class SimScorer {

			/** Sole constructor, meant to be invoked by subclass constructors. */
			protected SimScorer() {}

			/**
			 * Scores one document. {@code freq} is the sloppy document-term frequency; it must be
			 * finite and positive. {@code norm} is the encoded normalization factor that {@link
			 * Similarity#computeNorm(FieldInvertState)} produced at index time, or {@code 1} when
			 * norms are disabled. {@code norm} is never {@code 0}.
			 *
			 * <p>The score must be non-decreasing in {@code freq}: if {@code freq1 > freq2}, then
			 * {@code score(freq1, norm) >= score(freq2, norm)} for every {@code norm} that {@link
			 * Similarity#computeNorm(FieldInvertState)} may produce.
			 *
			 * <p>The score must be non-increasing in the unsigned {@code norm}: if {@code
			 * Long.compareUnsigned(norm1, norm2) > 0}, then {@code score(freq, norm1) <=
			 * score(freq, norm2)} for every legal {@code freq}.
			 *
			 * <p>It follows that {@code score(Float.MAX_VALUE, 1)} is an upper bound on every
			 * score this scorer can produce.
			 *
			 * @param freq sloppy term frequency, must be finite and positive
			 * @param norm encoded normalization factor or {@code 1} if norms are disabled
			 * @return document's score
			 */
			public abstract float score(float freq, long norm);

			/**
			 * Explains the score of a single document. The score is recomputed through {@link
			 * #score(float, long)} from the value carried by {@code freq}, and {@code freq} itself
			 * is attached as the only detail of the returned explanation.
			 *
			 * @param freq Explanation of how the sloppy term frequency was computed
			 * @param norm encoded normalization factor, as returned by {@link
			 *     Similarity#computeNorm}, or {@code 1} if norms are disabled
			 * @return explanation of the document's score
			 */
			public Explanation explain(Explanation freq, long norm) {
				// Score first, then build the description, mirroring argument evaluation order.
				final float docScore = score(freq.getValue().floatValue(), norm);
				final String description = "score(freq=" + freq.getValue() + "), with freq of:";
				return Explanation.match(docScore, description, Collections.singleton(freq));
			}
		}
	}