/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;

import java.util.Arrays;

import org.apache.lucene.util.BytesRefHash;

/**
* Ranks passages found by {@link UnifiedHighlighter}.
* <p>
* Each passage is scored as a miniature document within the document.
* The final score is computed as {@link #norm} * &sum; ({@link #weight} * {@link #tf}).
* The default implementation is {@link #norm} * BM25.
*
* @lucene.experimental
*/
public class PassageScorer {
  // TODO: this formula is completely made up. It might not provide relevant snippets!

  /**
   * BM25 k1 parameter, controls term frequency normalization.
   */
  final float k1;

  /**
   * BM25 b parameter, controls length normalization.
   */
  final float b;

  /**
   * A pivot used for length normalization.
   */
  final float pivot;

  /**
   * Creates PassageScorer with these default values:
   * <ul>
   *   <li>{@code k1 = 1.2},</li>
   *   <li>{@code b = 0.75},</li>
   *   <li>{@code pivot = 87}.</li>
   * </ul>
   */
  public PassageScorer() {
    // 1.2 and 0.75 are well-known BM25 defaults (but maybe not the best here?)
    // 87 is a typical average English sentence length, in characters
    this(1.2f, 0.75f, 87f);
  }

  /**
   * Creates PassageScorer with the specified scoring parameters.
   *
   * @param k1 Controls non-linear term frequency normalization (saturation).
   * @param b Controls to what degree passage length normalizes tf values.
   * @param pivot Pivot value for length normalization (a rough estimate of average sentence length, in characters).
   */
  public PassageScorer(float k1, float b, float pivot) {
    this.k1 = k1;
    this.b = b;
    this.pivot = pivot;
  }
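
  // Hedged usage sketch (illustration only, not part of this file): a custom
  // PassageScorer is typically supplied by overriding UnifiedHighlighter's protected
  // getScorer(String field) hook; verify that hook against the Lucene version in use.
  //
  //   UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer) {
  //     @Override
  //     protected PassageScorer getScorer(String field) {
  //       return new PassageScorer(1.2f, 0.75f, 87f); // defaults shown; tune per corpus
  //     }
  //   };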

  /**
   * Computes term importance, given its in-document statistics.
   *
   * @param contentLength length of the document in characters
   * @param totalTermFreq number of times the term occurs in the document
   * @return term importance
   */
  public float weight(int contentLength, int totalTermFreq) {
    // approximate the number of "documents" from the content length
    float numDocs = 1 + contentLength / pivot;
    // numDocs, not numDocs - docFreq (a la DFR), since we approximate numDocs
    return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D) / (totalTermFreq + 0.5D));
  }
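
  // Worked example (hand computation under the defaults k1 = 1.2, pivot = 87; the
  // inputs are illustrative assumptions): contentLength = 870 gives
  // numDocs = 1 + 870/87 = 11, and totalTermFreq = 3 gives
  //   weight = (1.2 + 1) * ln(1 + (11 + 0.5) / (3 + 0.5)) = 2.2 * ln(4.2857...) ~= 3.20
  // so terms that are rarer in the document receive a larger weight.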

  /**
   * Computes normalized term frequency, given the term's frequency within the passage
   * and the passage's length.
   *
   * @param freq number of occurrences of the term within this passage
   * @param passageLen length of the passage in characters
   * @return normalized term frequency
   */
  public float tf(int freq, int passageLen) {
    // BM25-style length normalization: longer passages need more occurrences for the same tf
    float norm = k1 * ((1 - b) + b * (passageLen / pivot));
    return freq / (freq + norm);
  }
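
  // Worked example (illustrative inputs, defaults k1 = 1.2, b = 0.75, pivot = 87):
  // a pivot-length passage (passageLen = 87) has norm = 1.2 * ((1 - 0.75) + 0.75 * 1) = 1.2,
  // so freq = 2 gives tf = 2 / (2 + 1.2) = 0.625. The value saturates towards 1 as
  // freq grows, so additional occurrences yield diminishing returns.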

  /**
   * Normalizes a passage according to its position in the document.
   * <p>
   * Typically passages towards the beginning of the document are
   * more useful for summarizing the contents.
   * <p>
   * The default implementation is <code>1 + 1/log(pivot + passageStart)</code>.
   *
   * @param passageStart start offset of the passage, in characters
   * @return a boost value multiplied into the passage's score
   */
  public float norm(int passageStart) {
    return 1 + 1 / (float) Math.log(pivot + passageStart);
  }
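
  // Worked example (illustrative offsets, default pivot = 87): a passage at the very
  // start of the document (passageStart = 0) gets 1 + 1/ln(87) ~= 1.22, while one at
  // offset 913 gets 1 + 1/ln(1000) ~= 1.14, so earlier passages get a mildly larger boost.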

  /**
   * Scores a passage: sums, over the passage's distinct matching terms,
   * {@link #tf} within the passage times {@link #weight} within the document,
   * then multiplies by the positional {@link #norm} boost.
   *
   * @param passage the passage to score
   * @param contentLength length of the document in characters
   * @return the passage score
   */
  public float score(Passage passage, int contentLength) {
    float score = 0;
    BytesRefHash termsHash = new BytesRefHash();
    int hitCount = passage.getNumMatches();
    int[] termFreqsInPassage = new int[hitCount]; // maximum size
    int[] termFreqsInDoc = new int[hitCount];
    Arrays.fill(termFreqsInPassage, 0);

    // Gather per-term frequencies; BytesRefHash deduplicates the match terms.
    for (int i = 0; i < hitCount; i++) {
      int termIndex = termsHash.add(passage.getMatchTerms()[i]);
      if (termIndex < 0) {
        // negative return: term already added, decode its existing index
        termIndex = -(termIndex + 1);
      } else {
        // first sighting of this term: record its frequency in the whole document
        termFreqsInDoc[termIndex] = passage.getMatchTermFreqsInDoc()[i];
      }
      termFreqsInPassage[termIndex]++;
    }

    // Sum tf * weight over the distinct terms, then apply the positional boost.
    for (int i = 0; i < termsHash.size(); i++) {
      score += tf(termFreqsInPassage[i], passage.getLength()) * weight(contentLength, termFreqsInDoc[i]);
    }
    score *= norm(passage.getStartOffset());
    return score;
  }
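
  // End-to-end hand computation (illustrative, combining the worked examples above):
  // a passage at the start of an 870-character document, matching one term twice
  // (tf = 0.625 for a pivot-length passage) with totalTermFreq = 3 in the document
  // (weight ~= 3.20), scores roughly 1.22 * (0.625 * 3.20) ~= 2.44.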
}