| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.similarities; |
| |
| import java.io.IOException; |
| import java.util.Random; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.LeafReader; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.search.CheckHits; |
| import org.apache.lucene.search.CollectionStatistics; |
| import org.apache.lucene.search.Explanation; |
| import org.apache.lucene.search.TermStatistics; |
| import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.SmallFloat; |
| import org.apache.lucene.util.TestUtil; |
| import org.junit.AfterClass; |
| import org.junit.BeforeClass; |
| |
| /** |
| * Abstract class to do basic tests for a similarity. |
| * NOTE: This test focuses on the similarity impl, nothing else. |
| * The [stretch] goal is for this test to be |
| * so thorough in testing a new Similarity that if this |
| * test passes, then all Lucene/Solr tests should also pass. Ie, |
| * if there is some bug in a given Similarity that this |
| * test fails to catch then this test needs to be improved! */ |
| public abstract class BaseSimilarityTestCase extends LuceneTestCase { |
| |
| static LeafReader READER; |
| static Directory DIR; |
| |
| @BeforeClass |
| public static void beforeClass() throws Exception { |
| // with norms |
| DIR = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), DIR); |
| Document doc = new Document(); |
| FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED); |
| fieldType.setOmitNorms(true); |
| doc.add(newField("field", "value", fieldType)); |
| writer.addDocument(doc); |
| READER = getOnlyLeafReader(writer.getReader()); |
| writer.close(); |
| } |
| |
| @AfterClass |
| public static void afterClass() throws Exception { |
| IOUtils.close(READER, DIR); |
| READER = null; |
| DIR = null; |
| } |
| |
| /** |
| * Return a new similarity with all parameters randomized within valid ranges. |
| */ |
| protected abstract Similarity getSimilarity(Random random); |
| |
| static final long MAXDOC_FORTESTING = 1L << 48; |
| // must be at least MAXDOC_FORTESTING + Integer.MAX_VALUE |
| static final long MAXTOKENS_FORTESTING = 1L << 49; |
| |
| /** |
| * returns a random corpus that is at least possible given |
| * the norm value for a single document. |
| */ |
| static CollectionStatistics newCorpus(Random random, int norm) { |
| // lower bound of tokens in the collection (you produced this norm somehow) |
| final int lowerBound; |
| if (norm == 0) { |
| // norms are omitted, but there must have been at least one token to produce that norm |
| lowerBound = 1; |
| } else { |
| // minimum value that would decode to such a norm |
| lowerBound = SmallFloat.byte4ToInt((byte) norm); |
| } |
| final long maxDoc; |
| switch (random.nextInt(6)) { |
| case 0: |
| // 1 doc collection |
| maxDoc = 1; |
| break; |
| case 1: |
| // 2 doc collection |
| maxDoc = 2; |
| break; |
| case 2: |
| // tiny collection |
| maxDoc = TestUtil.nextLong(random, 3, 16); |
| break; |
| case 3: |
| // small collection |
| maxDoc = TestUtil.nextLong(random, 16, 100000); |
| break; |
| case 4: |
| // big collection |
| maxDoc = TestUtil.nextLong(random, 100000, MAXDOC_FORTESTING); |
| break; |
| default: |
| // yuge collection |
| maxDoc = MAXDOC_FORTESTING; |
| break; |
| } |
| final long docCount; |
| switch (random.nextInt(3)) { |
| case 0: |
| // sparsest field |
| docCount = 1; |
| break; |
| case 1: |
| // sparse field |
| docCount = TestUtil.nextLong(random, 1, maxDoc); |
| break; |
| default: |
| // fully populated |
| docCount = maxDoc; |
| break; |
| } |
| // random docsize: but can't require docs to have > 2B tokens |
| long upperBound; |
| try { |
| upperBound = Math.min(MAXTOKENS_FORTESTING, Math.multiplyExact(docCount, Integer.MAX_VALUE)); |
| } catch (ArithmeticException overflow) { |
| upperBound = MAXTOKENS_FORTESTING; |
| } |
| final long sumDocFreq; |
| switch (random.nextInt(3)) { |
| case 0: |
| // shortest possible docs |
| sumDocFreq = docCount; |
| break; |
| case 1: |
| // biggest possible docs |
| sumDocFreq = upperBound + 1 - lowerBound; |
| break; |
| default: |
| // random docsize |
| sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound); |
| break; |
| } |
| final long sumTotalTermFreq; |
| switch (random.nextInt(4)) { |
| case 0: |
| // term frequencies were omitted |
| sumTotalTermFreq = sumDocFreq; |
| break; |
| case 1: |
| // no repetition of terms (except to satisfy this norm) |
| sumTotalTermFreq = sumDocFreq - 1 + lowerBound; |
| break; |
| case 2: |
| // maximum repetition of terms |
| sumTotalTermFreq = upperBound; |
| break; |
| default: |
| // random repetition |
| assert sumDocFreq - 1 + lowerBound <= upperBound; |
| sumTotalTermFreq = TestUtil.nextLong(random, sumDocFreq - 1 + lowerBound, upperBound); |
| break; |
| } |
| return new CollectionStatistics("field", maxDoc, docCount, sumTotalTermFreq, sumDocFreq); |
| } |
| |
| private static final BytesRef TERM = new BytesRef("term"); |
| |
| /** |
| * returns new random term, that fits within the bounds of the corpus |
| */ |
| static TermStatistics newTerm(Random random, CollectionStatistics corpus) { |
| final long docFreq; |
| switch (random.nextInt(3)) { |
| case 0: |
| // rare term |
| docFreq = 1; |
| break; |
| case 1: |
| // common term |
| docFreq = corpus.docCount(); |
| break; |
| default: |
| // random specificity |
| docFreq = TestUtil.nextLong(random, 1, corpus.docCount()); |
| break; |
| } |
| final long totalTermFreq; |
| // can't require docs to have > 2B tokens |
| long upperBound; |
| try { |
| upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE)); |
| } catch (ArithmeticException overflow) { |
| upperBound = corpus.sumTotalTermFreq(); |
| } |
| if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) { |
| // omitTF |
| totalTermFreq = docFreq; |
| } else { |
| switch (random.nextInt(3)) { |
| case 0: |
| // no repetition |
| totalTermFreq = docFreq; |
| break; |
| case 1: |
| // maximum repetition |
| totalTermFreq = upperBound; |
| break; |
| default: |
| // random repetition |
| totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound); |
| break; |
| } |
| } |
| return new TermStatistics(TERM, docFreq, totalTermFreq); |
| } |
| |
| /** |
| * Tests scoring across a bunch of random terms/corpora/frequencies for each possible document length. |
| * It does the following checks: |
| * <ul> |
| * <li>scores are non-negative and finite. |
| * <li>score matches the explanation exactly. |
| * <li>internal explanations calculations are sane (e.g. sum of: and so on actually compute sums) |
| * <li>scores don't decrease as term frequencies increase: e.g. score(freq=N + 1) >= score(freq=N) |
| * <li>scores don't decrease as documents get shorter, e.g. score(len=M) >= score(len=M+1) |
| * <li>scores don't decrease as terms get rarer, e.g. score(term=N) >= score(term=N+1) |
| * <li>scoring works for floating point frequencies (e.g. sloppy phrase and span queries will work) |
| * <li>scoring works for reasonably large 64-bit statistic values (e.g. distributed search will work) |
| * <li>scoring works for reasonably large boost values (0 .. Integer.MAX_VALUE, e.g. query boosts will work) |
| * <li>scoring works for parameters randomized within valid ranges (see {@link #getSimilarity(Random)}) |
| * </ul> |
| */ |
| public void testRandomScoring() throws Exception { |
| Random random = random(); |
| final int iterations = atLeast(1); |
| for (int i = 0; i < iterations; i++) { |
| // pull a new similarity to switch up parameters |
| Similarity similarity = getSimilarity(random); |
| for (int j = 0; j < 3; j++) { |
| // for each norm value... |
| for (int k = 1; k < 256; k++) { |
| CollectionStatistics corpus = newCorpus(random, k); |
| for (int l = 0; l < 10; l++) { |
| TermStatistics term = newTerm(random, corpus); |
| final float freq; |
| if (term.totalTermFreq() == term.docFreq()) { |
| // omit TF |
| freq = 1; |
| } else if (term.docFreq() == 1) { |
| // only one document, all the instances must be here. |
| freq = Math.toIntExact(term.totalTermFreq()); |
| } else { |
| // there is at least one other document, and those must have at least 1 instance each. |
| int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE)); |
| if (random.nextBoolean()) { |
| // integer freq |
| switch (random.nextInt(3)) { |
| case 0: |
| // smallest freq |
| freq = 1; |
| break; |
| case 1: |
| // largest freq |
| freq = upperBound; |
| break; |
| default: |
| // random freq |
| freq = TestUtil.nextInt(random, 1, upperBound); |
| break; |
| } |
| } else { |
| // float freq |
| float freqCandidate; |
| switch (random.nextInt(2)) { |
| case 0: |
| // smallest freq |
| freqCandidate = Float.MIN_VALUE; |
| break; |
| default: |
| // random freq |
| freqCandidate = upperBound * random.nextFloat(); |
| break; |
| } |
| // we need to be 2nd float value at a minimum, the pairwise test will check MIN_VALUE in this case. |
| // this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc) |
| if (freqCandidate <= Float.MIN_VALUE) { |
| freqCandidate = Math.nextUp(Float.MIN_VALUE); |
| } |
| freq = freqCandidate; |
| } |
| } |
| // we just limit the test to "reasonable" boost values but don't enforce this anywhere. |
| // too big, and you are asking for overflow. that's hard for a sim to enforce (but definitely possible) |
| // for now, we just want to detect overflow where its a real bug/hazard in the computation with reasonable inputs. |
| final float boost; |
| switch (random.nextInt(5)) { |
| case 0: |
| // minimum value (not enforced) |
| boost = 0F; |
| break; |
| case 1: |
| // tiny value |
| boost = Float.MIN_VALUE; |
| break; |
| case 2: |
| // no-op value (sometimes treated special in explanations) |
| boost = 1F; |
| break; |
| case 3: |
| // maximum value (not enforceD) |
| boost = Integer.MAX_VALUE; |
| break; |
| default: |
| // random value |
| boost = random.nextFloat() * Integer.MAX_VALUE; |
| break; |
| } |
| doTestScoring(similarity, corpus, term, boost, freq, k); |
| } |
| } |
| } |
| } |
| } |
| |
| /** runs for a single test case, so that if you hit a test failure you can write a reproducer just for that scenario */ |
| private static void doTestScoring(Similarity similarity, CollectionStatistics corpus, TermStatistics term, float boost, float freq, int norm) throws IOException { |
| boolean success = false; |
| SimScorer scorer = similarity.scorer(boost, corpus, term); |
| try { |
| float maxScore = scorer.score(Float.MAX_VALUE, 1); |
| assertFalse("maxScore is NaN", Float.isNaN(maxScore)); |
| |
| float score = scorer.score(freq, norm); |
| // check that score isn't infinite or negative |
| assertTrue("infinite/NaN score: " + score, Float.isFinite(score)); |
| if (!(similarity instanceof IndriDirichletSimilarity)) { |
| assertTrue("negative score: " + score, score >= 0); |
| } |
| assertTrue("greater than maxScore: " + score + ">" + maxScore, score <= maxScore); |
| // check explanation matches |
| Explanation explanation = scorer.explain(Explanation.match(freq, "freq, occurrences of term within document"), norm); |
| if (score != explanation.getValue().doubleValue()) { |
| fail("expected: " + score + ", got: " + explanation); |
| } |
| if (rarely()) { |
| CheckHits.verifyExplanation("<test query>", 0, score, true, explanation); |
| } |
| |
| // check score(freq-1), given the same norm it should be <= score(freq) [scores non-decreasing for more term occurrences] |
| final float prevFreq; |
| if (random().nextBoolean() && freq == (int)freq && freq > 1 && term.docFreq() > 1) { |
| // previous in integer space |
| prevFreq = freq - 1; |
| } else { |
| // previous in float space (e.g. for sloppyPhrase) |
| prevFreq = Math.nextDown(freq); |
| } |
| |
| float prevScore = scorer.score(prevFreq, norm); |
| // check that score isn't infinite or negative |
| assertTrue(Float.isFinite(prevScore)); |
| if (!(similarity instanceof IndriDirichletSimilarity)) { |
| assertTrue(prevScore >= 0); |
| } |
| // check explanation matches |
| Explanation prevExplanation = scorer.explain(Explanation.match(prevFreq, "freq, occurrences of term within document"), norm); |
| if (prevScore != prevExplanation.getValue().doubleValue()) { |
| fail("expected: " + prevScore + ", got: " + prevExplanation); |
| } |
| if (rarely()) { |
| CheckHits.verifyExplanation("test query (prevFreq)", 0, prevScore, true, prevExplanation); |
| } |
| |
| if (prevScore > score) { |
| System.out.println(prevExplanation); |
| System.out.println(explanation); |
| fail("score(" + prevFreq + ")=" + prevScore + " > score(" + freq + ")=" + score); |
| } |
| |
| // check score(norm-1), given the same freq it should be >= score(norm) [scores non-decreasing as docs get shorter] |
| if (norm > 1) { |
| float prevNormScore = scorer.score(freq, norm - 1); |
| // check that score isn't infinite or negative |
| assertTrue(Float.isFinite(prevNormScore)); |
| if (!(similarity instanceof IndriDirichletSimilarity)) { |
| assertTrue(prevNormScore >= 0); |
| } |
| // check explanation matches |
| Explanation prevNormExplanation = scorer.explain(Explanation.match(freq, "freq, occurrences of term within document"), norm - 1); |
| if (prevNormScore != prevNormExplanation.getValue().doubleValue()) { |
| fail("expected: " + prevNormScore + ", got: " + prevNormExplanation); |
| } |
| if (rarely()) { |
| CheckHits.verifyExplanation("test query (prevNorm)", 0, prevNormScore, true, prevNormExplanation); |
| } |
| if (prevNormScore < score) { |
| System.out.println(prevNormExplanation); |
| System.out.println(explanation); |
| fail("score(" + freq + "," + (norm-1) + ")=" + prevNormScore + " < score(" + freq + "," + norm + ")=" + score); |
| } |
| } |
| |
| // check score(term-1), given the same freq/norm it should be >= score(term) [scores non-decreasing as terms get rarer] |
| if (term.docFreq() > 1 && freq < term.totalTermFreq()) { |
| TermStatistics prevTerm = new TermStatistics(term.term(), term.docFreq() - 1, term.totalTermFreq() - 1); |
| SimScorer prevTermScorer = similarity.scorer(boost, corpus, term); |
| float prevTermScore = prevTermScorer.score(freq, norm); |
| // check that score isn't infinite or negative |
| assertTrue(Float.isFinite(prevTermScore)); |
| if (!(similarity instanceof IndriDirichletSimilarity)) { |
| assertTrue(prevTermScore >= 0); |
| } |
| // check explanation matches |
| Explanation prevTermExplanation = prevTermScorer.explain(Explanation.match(freq, "freq, occurrences of term within document"), norm); |
| if (prevTermScore != prevTermExplanation.getValue().doubleValue()) { |
| fail("expected: " + prevTermScore + ", got: " + prevTermExplanation); |
| } |
| if (rarely()) { |
| CheckHits.verifyExplanation("test query (prevTerm)", 0, prevTermScore, true, prevTermExplanation); |
| } |
| |
| if (prevTermScore < score) { |
| System.out.println(prevTermExplanation); |
| System.out.println(explanation); |
| fail("score(" + freq + "," + (prevTerm) + ")=" + prevTermScore + " < score(" + freq + "," + term + ")=" + score); |
| } |
| } |
| |
| success = true; |
| } finally { |
| if (!success) { |
| System.out.println(similarity); |
| System.out.println(corpus); |
| System.out.println(term); |
| if (norm == 0) { |
| System.out.println("norms=omitted"); |
| } else { |
| System.out.println("norm=" + norm + " (doc length ~ " + SmallFloat.byte4ToInt((byte) norm) + ")"); |
| } |
| System.out.println("freq=" + freq); |
| } |
| } |
| } |
| } |