/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;

import java.util.Arrays;

import org.apache.lucene.util.BytesRefHash;

/**
* Ranks passages found by {@link UnifiedHighlighter}.
* <p>
* Each passage is scored as a miniature document within the document.
* The final score is computed as {@link #norm} * &sum; ({@link #weight} * {@link #tf}).
* The default implementation is {@link #norm} * BM25.
*
* @lucene.experimental
*/
public class PassageScorer {
  // TODO: this formula is completely made up. It might not provide relevant snippets!

  /**
   * BM25 k1 parameter, controls term frequency normalization.
   */
  final float k1;

  /**
   * BM25 b parameter, controls length normalization.
   */
  final float b;

  /**
   * A pivot used for length normalization.
   */
  final float pivot;

  /**
   * Creates PassageScorer with these default values:
   * <ul>
   *   <li>{@code k1 = 1.2},</li>
   *   <li>{@code b = 0.75},</li>
   *   <li>{@code pivot = 87}.</li>
   * </ul>
   */
  public PassageScorer() {
    // 1.2 and 0.75 are well-known BM25 defaults (but maybe not the best here?)
    // 87 is a typical average English sentence length, in characters
    this(1.2f, 0.75f, 87f);
  }

  /**
   * Creates PassageScorer with the specified scoring parameters.
   *
   * @param k1 Controls non-linear term frequency normalization (saturation).
   * @param b Controls to what degree passage length normalizes tf values.
   * @param pivot Pivot value for length normalization (a rough estimate of average sentence length, in characters).
   */
  public PassageScorer(float k1, float b, float pivot) {
    this.k1 = k1;
    this.b = b;
    this.pivot = pivot;
  }
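
  // Hedged usage sketch (illustration only, not part of this file): a custom
  // PassageScorer is typically supplied by overriding UnifiedHighlighter's protected
  // getScorer(String field) hook; verify that hook against the Lucene version in use.
  //
  //   UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer) {
  //     @Override
  //     protected PassageScorer getScorer(String field) {
  //       return new PassageScorer(1.2f, 0.75f, 87f); // defaults shown; tune per corpus
  //     }
  //   };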

  /**
   * Computes term importance, given its in-document statistics.
   *
   * @param contentLength length of the document in characters
   * @param totalTermFreq number of times the term occurs in the document
   * @return term importance
   */
  public float weight(int contentLength, int totalTermFreq) {
    // approximate the number of "documents" from the content length
    float numDocs = 1 + contentLength / pivot;
    // numDocs, not numDocs - docFreq (a la DFR), since we approximate numDocs
    return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D) / (totalTermFreq + 0.5D));
  }
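
  // Worked example (hand computation under the defaults k1 = 1.2, pivot = 87; the
  // inputs are illustrative assumptions): contentLength = 870 gives
  // numDocs = 1 + 870/87 = 11, and totalTermFreq = 3 gives
  //   weight = (1.2 + 1) * ln(1 + (11 + 0.5) / (3 + 0.5)) = 2.2 * ln(4.2857...) ~= 3.20
  // so terms that are rarer in the document receive a larger weight.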

  /**
   * Computes normalized term frequency, given the term's frequency within the passage
   * and the passage's length.
   *
   * @param freq number of occurrences of the term within this passage
   * @param passageLen length of the passage in characters
   * @return normalized term frequency
   */
  public float tf(int freq, int passageLen) {
    // BM25-style length normalization: longer passages need more occurrences for the same tf
    float norm = k1 * ((1 - b) + b * (passageLen / pivot));
    return freq / (freq + norm);
  }
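
  // Worked example (illustrative inputs, defaults k1 = 1.2, b = 0.75, pivot = 87):
  // a pivot-length passage (passageLen = 87) has norm = 1.2 * ((1 - 0.75) + 0.75 * 1) = 1.2,
  // so freq = 2 gives tf = 2 / (2 + 1.2) = 0.625. The value saturates towards 1 as
  // freq grows, so additional occurrences yield diminishing returns.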

  /**
   * Normalizes a passage according to its position in the document.
   * <p>
   * Typically passages towards the beginning of the document are
   * more useful for summarizing the contents.
   * <p>
   * The default implementation is <code>1 + 1/log(pivot + passageStart)</code>.
   *
   * @param passageStart start offset of the passage, in characters
   * @return a boost value multiplied into the passage's score
   */
  public float norm(int passageStart) {
    return 1 + 1 / (float) Math.log(pivot + passageStart);
  }
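
  // Worked example (illustrative offsets, default pivot = 87): a passage at the very
  // start of the document (passageStart = 0) gets 1 + 1/ln(87) ~= 1.22, while one at
  // offset 913 gets 1 + 1/ln(1000) ~= 1.14, so earlier passages get a mildly larger boost.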

  /**
   * Scores a passage: sums, over the passage's distinct matching terms,
   * {@link #tf} within the passage times {@link #weight} within the document,
   * then multiplies by the positional {@link #norm} boost.
   *
   * @param passage the passage to score
   * @param contentLength length of the document in characters
   * @return the passage score
   */
  public float score(Passage passage, int contentLength) {
    float score = 0;
    BytesRefHash termsHash = new BytesRefHash();
    int hitCount = passage.getNumMatches();
    int[] termFreqsInPassage = new int[hitCount]; // maximum size
    int[] termFreqsInDoc = new int[hitCount];
    Arrays.fill(termFreqsInPassage, 0);

    // Gather per-term frequencies; BytesRefHash deduplicates the match terms.
    for (int i = 0; i < hitCount; i++) {
      int termIndex = termsHash.add(passage.getMatchTerms()[i]);
      if (termIndex < 0) {
        // negative return: term already added, decode its existing index
        termIndex = -(termIndex + 1);
      } else {
        // first sighting of this term: record its frequency in the whole document
        termFreqsInDoc[termIndex] = passage.getMatchTermFreqsInDoc()[i];
      }
      termFreqsInPassage[termIndex]++;
    }

    // Sum tf * weight over the distinct terms, then apply the positional boost.
    for (int i = 0; i < termsHash.size(); i++) {
      score += tf(termFreqsInPassage[i], passage.getLength()) * weight(contentLength, termFreqsInDoc[i]);
    }
    score *= norm(passage.getStartOffset());
    return score;
  }
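
  // End-to-end hand computation (illustrative, combining the worked examples above):
  // a passage at the start of an 870-character document, matching one term twice
  // (tf = 0.625 for a pivot-length passage) with totalTermFreq = 3 in the document
  // (weight ~= 3.20), scores roughly 1.22 * (0.625 * 3.20) ~= 2.44.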
}