| package org.apache.lucene.search.similarities; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.index.AtomicReaderContext; |
| import org.apache.lucene.index.FieldInvertState; |
| import org.apache.lucene.index.NumericDocValues; |
| import org.apache.lucene.search.CollectionStatistics; |
| import org.apache.lucene.search.Explanation; |
| import org.apache.lucene.search.TermStatistics; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.SmallFloat; |
| |
| /** |
| * A subclass of {@code Similarity} that provides a simplified API for its |
| * descendants. Subclasses are only required to implement the {@link #score} |
| * and {@link #toString()} methods. Implementing |
| * {@link #explain(Explanation, BasicStats, int, float, float)} is optional, |
| * inasmuch as SimilarityBase already provides a basic explanation of the score |
| * and the term frequency. However, implementers of a subclass are encouraged to |
| * include as much detail about the scoring method as possible. |
| * <p> |
| * Note: multi-word queries such as phrase queries are scored in a different way |
| * than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for |
| * the phrase as a whole (since it does not know it), this class instead scores |
| * phrases as a summation of the individual term scores. |
| * @lucene.experimental |
| */ |
| public abstract class SimilarityBase extends Similarity { |
| /** For {@link #log2(double)}. Precomputed for efficiency reasons. */ |
| private static final double LOG_2 = Math.log(2); |
| |
| /** |
| * True if overlap tokens (tokens with a position of increment of zero) are |
| * discounted from the document's length. |
| */ |
| protected boolean discountOverlaps = true; |
| |
| /** |
| * Sole constructor. (For invocation by subclass |
| * constructors, typically implicit.) |
| */ |
| public SimilarityBase() {} |
| |
| /** Determines whether overlap tokens (Tokens with |
| * 0 position increment) are ignored when computing |
| * norm. By default this is true, meaning overlap |
| * tokens do not count when computing norms. |
| * |
| * @lucene.experimental |
| * |
| * @see #computeNorm |
| */ |
| public void setDiscountOverlaps(boolean v) { |
| discountOverlaps = v; |
| } |
| |
| /** |
| * Returns true if overlap tokens are discounted from the document's length. |
| * @see #setDiscountOverlaps |
| */ |
| public boolean getDiscountOverlaps() { |
| return discountOverlaps; |
| } |
| |
| @Override |
| public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) { |
| BasicStats stats[] = new BasicStats[termStats.length]; |
| for (int i = 0; i < termStats.length; i++) { |
| stats[i] = newStats(collectionStats.field(), queryBoost); |
| fillBasicStats(stats[i], collectionStats, termStats[i]); |
| } |
| return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats); |
| } |
| |
| /** Factory method to return a custom stats object */ |
| protected BasicStats newStats(String field, float queryBoost) { |
| return new BasicStats(field, queryBoost); |
| } |
| |
| /** Fills all member fields defined in {@code BasicStats} in {@code stats}. |
| * Subclasses can override this method to fill additional stats. */ |
| protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { |
| // #positions(field) must be >= #positions(term) |
| assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq(); |
| long numberOfDocuments = collectionStats.maxDoc(); |
| |
| long docFreq = termStats.docFreq(); |
| long totalTermFreq = termStats.totalTermFreq(); |
| |
| // codec does not supply totalTermFreq: substitute docFreq |
| if (totalTermFreq == -1) { |
| totalTermFreq = docFreq; |
| } |
| |
| final long numberOfFieldTokens; |
| final float avgFieldLength; |
| |
| long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); |
| |
| if (sumTotalTermFreq <= 0) { |
| // field does not exist; |
| // We have to provide something if codec doesnt supply these measures, |
| // or if someone omitted frequencies for the field... negative values cause |
| // NaN/Inf for some scorers. |
| numberOfFieldTokens = docFreq; |
| avgFieldLength = 1; |
| } else { |
| numberOfFieldTokens = sumTotalTermFreq; |
| avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments; |
| } |
| |
| // TODO: add sumDocFreq for field (numberOfFieldPostings) |
| stats.setNumberOfDocuments(numberOfDocuments); |
| stats.setNumberOfFieldTokens(numberOfFieldTokens); |
| stats.setAvgFieldLength(avgFieldLength); |
| stats.setDocFreq(docFreq); |
| stats.setTotalTermFreq(totalTermFreq); |
| } |
| |
| /** |
| * Scores the document {@code doc}. |
| * <p>Subclasses must apply their scoring formula in this class.</p> |
| * @param stats the corpus level statistics. |
| * @param freq the term frequency. |
| * @param docLen the document length. |
| * @return the score. |
| */ |
| protected abstract float score(BasicStats stats, float freq, float docLen); |
| |
| /** |
| * Subclasses should implement this method to explain the score. {@code expl} |
| * already contains the score, the name of the class and the doc id, as well |
| * as the term frequency and its explanation; subclasses can add additional |
| * clauses to explain details of their scoring formulae. |
| * <p>The default implementation does nothing.</p> |
| * |
| * @param expl the explanation to extend with details. |
| * @param stats the corpus level statistics. |
| * @param doc the document id. |
| * @param freq the term frequency. |
| * @param docLen the document length. |
| */ |
| protected void explain( |
| Explanation expl, BasicStats stats, int doc, float freq, float docLen) {} |
| |
| /** |
| * Explains the score. The implementation here provides a basic explanation |
| * in the format <em>score(name-of-similarity, doc=doc-id, |
| * freq=term-frequency), computed from:</em>, and |
| * attaches the score (computed via the {@link #score(BasicStats, float, float)} |
| * method) and the explanation for the term frequency. Subclasses content with |
| * this format may add additional details in |
| * {@link #explain(Explanation, BasicStats, int, float, float)}. |
| * |
| * @param stats the corpus level statistics. |
| * @param doc the document id. |
| * @param freq the term frequency and its explanation. |
| * @param docLen the document length. |
| * @return the explanation. |
| */ |
| protected Explanation explain( |
| BasicStats stats, int doc, Explanation freq, float docLen) { |
| Explanation result = new Explanation(); |
| result.setValue(score(stats, freq.getValue(), docLen)); |
| result.setDescription("score(" + getClass().getSimpleName() + |
| ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:"); |
| result.addDetail(freq); |
| |
| explain(result, stats, doc, freq.getValue(), docLen); |
| |
| return result; |
| } |
| |
| @Override |
| public SimScorer simScorer(SimWeight stats, AtomicReaderContext context) throws IOException { |
| if (stats instanceof MultiSimilarity.MultiStats) { |
| // a multi term query (e.g. phrase). return the summation, |
| // scoring almost as if it were boolean query |
| SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats; |
| SimScorer subScorers[] = new SimScorer[subStats.length]; |
| for (int i = 0; i < subScorers.length; i++) { |
| BasicStats basicstats = (BasicStats) subStats[i]; |
| subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field)); |
| } |
| return new MultiSimilarity.MultiSimScorer(subScorers); |
| } else { |
| BasicStats basicstats = (BasicStats) stats; |
| return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field)); |
| } |
| } |
| |
| /** |
| * Subclasses must override this method to return the name of the Similarity |
| * and preferably the values of parameters (if any) as well. |
| */ |
| @Override |
| public abstract String toString(); |
| |
| // ------------------------------ Norm handling ------------------------------ |
| |
| /** Norm -> document length map. */ |
| private static final float[] NORM_TABLE = new float[256]; |
| |
| static { |
| for (int i = 0; i < 256; i++) { |
| float floatNorm = SmallFloat.byte315ToFloat((byte)i); |
| NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm); |
| } |
| } |
| |
| /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */ |
| @Override |
| public long computeNorm(FieldInvertState state) { |
| final float numTerms; |
| if (discountOverlaps) |
| numTerms = state.getLength() - state.getNumOverlap(); |
| else |
| numTerms = state.getLength() / state.getBoost(); |
| return encodeNormValue(state.getBoost(), numTerms); |
| } |
| |
| /** Decodes a normalization factor (document length) stored in an index. |
| * @see #encodeNormValue(float,float) |
| */ |
| protected float decodeNormValue(byte norm) { |
| return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127 |
| } |
| |
| /** Encodes the length to a byte via SmallFloat. */ |
| protected byte encodeNormValue(float boost, float length) { |
| return SmallFloat.floatToByte315((boost / (float) Math.sqrt(length))); |
| } |
| |
| // ----------------------------- Static methods ------------------------------ |
| |
| /** Returns the base two logarithm of {@code x}. */ |
| public static double log2(double x) { |
| // Put this to a 'util' class if we need more of these. |
| return Math.log(x) / LOG_2; |
| } |
| |
| // --------------------------------- Classes --------------------------------- |
| |
| /** Delegates the {@link #score(int, float)} and |
| * {@link #explain(int, Explanation)} methods to |
| * {@link SimilarityBase#score(BasicStats, float, float)} and |
| * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)}, |
| * respectively. |
| */ |
| private class BasicSimScorer extends SimScorer { |
| private final BasicStats stats; |
| private final NumericDocValues norms; |
| |
| BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException { |
| this.stats = stats; |
| this.norms = norms; |
| } |
| |
| @Override |
| public float score(int doc, float freq) { |
| // We have to supply something in case norms are omitted |
| return SimilarityBase.this.score(stats, freq, |
| norms == null ? 1F : decodeNormValue((byte)norms.get(doc))); |
| } |
| @Override |
| public Explanation explain(int doc, Explanation freq) { |
| return SimilarityBase.this.explain(stats, doc, freq, |
| norms == null ? 1F : decodeNormValue((byte)norms.get(doc))); |
| } |
| |
| @Override |
| public float computeSlopFactor(int distance) { |
| return 1.0f / (distance + 1); |
| } |
| |
| @Override |
| public float computePayloadFactor(int doc, int start, int end, BytesRef payload) { |
| return 1f; |
| } |
| } |
| } |