| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.similarities; |
| |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.lucene.index.FieldInvertState; |
| import org.apache.lucene.index.IndexOptions; |
| import org.apache.lucene.search.CollectionStatistics; |
| import org.apache.lucene.search.Explanation; |
| import org.apache.lucene.search.TermStatistics; |
| import org.apache.lucene.util.SmallFloat; |
| |
| /** |
| * A subclass of {@code Similarity} that provides a simplified API for its |
| * descendants. Subclasses are only required to implement the {@link #score} |
| * and {@link #toString()} methods. Implementing |
| * {@link #explain(List, BasicStats, double, double)} is optional, |
| * inasmuch as SimilarityBase already provides a basic explanation of the score |
| * and the term frequency. However, implementers of a subclass are encouraged to |
| * include as much detail about the scoring method as possible. |
| * <p> |
| * Note: multi-word queries such as phrase queries are scored in a different way |
| * than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for |
| * the phrase as a whole (since it does not know it), this class instead scores |
| * phrases as a summation of the individual term scores. |
| * @lucene.experimental |
| */ |
| public abstract class SimilarityBase extends Similarity { |
| /** For {@link #log2(double)}. Precomputed for efficiency reasons. */ |
| private static final double LOG_2 = Math.log(2); |
| |
| /** |
| * True if overlap tokens (tokens with a position of increment of zero) are |
| * discounted from the document's length. |
| */ |
| protected boolean discountOverlaps = true; |
| |
| /** |
| * Sole constructor. (For invocation by subclass |
| * constructors, typically implicit.) |
| */ |
| public SimilarityBase() {} |
| |
| /** Determines whether overlap tokens (Tokens with |
| * 0 position increment) are ignored when computing |
| * norm. By default this is true, meaning overlap |
| * tokens do not count when computing norms. |
| * |
| * @lucene.experimental |
| * |
| * @see #computeNorm |
| */ |
| public void setDiscountOverlaps(boolean v) { |
| discountOverlaps = v; |
| } |
| |
| /** |
| * Returns true if overlap tokens are discounted from the document's length. |
| * @see #setDiscountOverlaps |
| */ |
| public boolean getDiscountOverlaps() { |
| return discountOverlaps; |
| } |
| |
| @Override |
| public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { |
| SimScorer weights[] = new SimScorer[termStats.length]; |
| for (int i = 0; i < termStats.length; i++) { |
| BasicStats stats = newStats(collectionStats.field(), boost); |
| fillBasicStats(stats, collectionStats, termStats[i]); |
| weights[i] = new BasicSimScorer(stats); |
| } |
| if (weights.length == 1) { |
| return weights[0]; |
| } else { |
| return new MultiSimilarity.MultiSimScorer(weights); |
| } |
| } |
| |
| /** Factory method to return a custom stats object */ |
| protected BasicStats newStats(String field, double boost) { |
| return new BasicStats(field, boost); |
| } |
| |
| /** Fills all member fields defined in {@code BasicStats} in {@code stats}. |
| * Subclasses can override this method to fill additional stats. */ |
| protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { |
| // TODO: validate this for real, somewhere else |
| assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq(); |
| assert termStats.docFreq() <= collectionStats.sumDocFreq(); |
| |
| // TODO: add sumDocFreq for field (numberOfFieldPostings) |
| stats.setNumberOfDocuments(collectionStats.docCount()); |
| stats.setNumberOfFieldTokens(collectionStats.sumTotalTermFreq()); |
| stats.setAvgFieldLength(collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount()); |
| stats.setDocFreq(termStats.docFreq()); |
| stats.setTotalTermFreq(termStats.totalTermFreq()); |
| } |
| |
| /** |
| * Scores the document {@code doc}. |
| * <p>Subclasses must apply their scoring formula in this class.</p> |
| * @param stats the corpus level statistics. |
| * @param freq the term frequency. |
| * @param docLen the document length. |
| * @return the score. |
| */ |
| protected abstract double score(BasicStats stats, double freq, double docLen); |
| |
| /** |
| * Subclasses should implement this method to explain the score. {@code expl} |
| * already contains the score, the name of the class and the doc id, as well |
| * as the term frequency and its explanation; subclasses can add additional |
| * clauses to explain details of their scoring formulae. |
| * <p>The default implementation does nothing.</p> |
| * |
| * @param subExpls the list of details of the explanation to extend |
| * @param stats the corpus level statistics. |
| * @param freq the term frequency. |
| * @param docLen the document length. |
| */ |
| protected void explain( |
| List<Explanation> subExpls, BasicStats stats, double freq, double docLen) {} |
| |
| /** |
| * Explains the score. The implementation here provides a basic explanation |
| * in the format <em>score(name-of-similarity, doc=doc-id, |
| * freq=term-frequency), computed from:</em>, and |
| * attaches the score (computed via the {@link #score(BasicStats, double, double)} |
| * method) and the explanation for the term frequency. Subclasses content with |
| * this format may add additional details in |
| * {@link #explain(List, BasicStats, double, double)}. |
| * |
| * @param stats the corpus level statistics. |
| * @param freq the term frequency and its explanation. |
| * @param docLen the document length. |
| * @return the explanation. |
| */ |
| protected Explanation explain( |
| BasicStats stats, Explanation freq, double docLen) { |
| List<Explanation> subs = new ArrayList<>(); |
| explain(subs, stats, freq.getValue().floatValue(), docLen); |
| |
| return Explanation.match( |
| (float) score(stats, freq.getValue().floatValue(), docLen), |
| "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:", |
| subs); |
| } |
| |
| /** |
| * Subclasses must override this method to return the name of the Similarity |
| * and preferably the values of parameters (if any) as well. |
| */ |
| @Override |
| public abstract String toString(); |
| |
| // ------------------------------ Norm handling ------------------------------ |
| |
| /** Cache of decoded bytes. */ |
| private static final float[] LENGTH_TABLE = new float[256]; |
| |
| static { |
| for (int i = 0; i < 256; i++) { |
| LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); |
| } |
| } |
| |
| /** Encodes the document length in the same way as {@link BM25Similarity}. */ |
| @Override |
| public final long computeNorm(FieldInvertState state) { |
| final int numTerms; |
| if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) { |
| numTerms = state.getUniqueTermCount(); |
| } else if (discountOverlaps) { |
| numTerms = state.getLength() - state.getNumOverlap(); |
| } else { |
| numTerms = state.getLength(); |
| } |
| return SmallFloat.intToByte4(numTerms); |
| } |
| |
| // ----------------------------- Static methods ------------------------------ |
| |
| /** Returns the base two logarithm of {@code x}. */ |
| public static double log2(double x) { |
| // Put this to a 'util' class if we need more of these. |
| return Math.log(x) / LOG_2; |
| } |
| |
| // --------------------------------- Classes --------------------------------- |
| |
| /** Delegates the {@link #score(float, long)} and |
| * {@link #explain(Explanation, long)} methods to |
| * {@link SimilarityBase#score(BasicStats, double, double)} and |
| * {@link SimilarityBase#explain(BasicStats, Explanation, double)}, |
| * respectively. |
| */ |
| final class BasicSimScorer extends SimScorer { |
| final BasicStats stats; |
| |
| BasicSimScorer(BasicStats stats) { |
| this.stats = stats; |
| } |
| |
| double getLengthValue(long norm) { |
| return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; |
| } |
| |
| @Override |
| public float score(float freq, long norm) { |
| return (float) SimilarityBase.this.score(stats, freq, getLengthValue(norm)); |
| } |
| |
| @Override |
| public Explanation explain(Explanation freq, long norm) { |
| return SimilarityBase.this.explain(stats, freq, getLengthValue(norm)); |
| } |
| |
| } |
| } |