diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
index d93594d..a5e0a20 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
@@ -193,21 +193,47 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
lowerBound = SmallFloat.byte4ToInt((byte) norm);
}
final long maxDoc;
- if (random.nextBoolean()) {
- // small collection
- maxDoc = TestUtil.nextLong(random, 1, 100000);
- } else {
- // yuge collection
- maxDoc = TestUtil.nextLong(random, 1, MAXDOC_FORTESTING);
+ switch (random.nextInt(6)) {
+ case 0:
+ // 1 doc collection
+ maxDoc = 1;
+ break;
+ case 1:
+ // 2 doc collection
+ maxDoc = 2;
+ break;
+ case 2:
+ // tiny collection
+ maxDoc = TestUtil.nextLong(random, 3, 16);
+ break;
+ case 3:
+ // small collection
+ maxDoc = TestUtil.nextLong(random, 16, 100000);
+ break;
+ case 4:
+ // big collection
+ maxDoc = TestUtil.nextLong(random, 100000, MAXDOC_FORTESTING);
+ break;
+ default:
+ // yuge collection
+ maxDoc = MAXDOC_FORTESTING;
+ break;
}
// TODO: make this a mandatory statistic, or test it with -1
final long docCount;
- if (random.nextBoolean()) {
- // sparse field
- docCount = TestUtil.nextLong(random, 1, maxDoc);
- } else {
- // fully populated
- docCount = maxDoc;
+ switch (random.nextInt(3)) {
+ case 0:
+ // sparsest field
+ docCount = 1;
+ break;
+ case 1:
+ // sparse field
+ docCount = TestUtil.nextLong(random, 1, maxDoc);
+ break;
+ default:
+ // fully populated
+ docCount = maxDoc;
+ break;
}
// random docsize: but can't require docs to have > 2B tokens
long upperBound;
@@ -218,15 +244,22 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
}
// TODO: make this a mandatory statistic, or test it with -1
final long sumDocFreq;
- if (random.nextBoolean()) {
- // shortest possible docs
- sumDocFreq = docCount;
- } else {
- // random docsize
- sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound);
+ switch (random.nextInt(3)) {
+ case 0:
+ // shortest possible docs
+ sumDocFreq = docCount;
+ break;
+ case 1:
+ // biggest possible docs
+ sumDocFreq = upperBound + 1 - lowerBound;
+ break;
+ default:
+ // random docsize
+ sumDocFreq = TestUtil.nextLong(random, docCount, upperBound + 1 - lowerBound);
+ break;
}
final long sumTotalTermFreq;
- switch (random.nextInt(3)) {
+ switch (random.nextInt(4)) {
case 0:
// unsupported (e.g. omitTF)
sumTotalTermFreq = -1;
@@ -235,6 +268,10 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
// no repetition of terms (except to satisfy this norm)
sumTotalTermFreq = sumDocFreq - 1 + lowerBound;
break;
+ case 2:
+ // maximum repetition of terms
+ sumTotalTermFreq = upperBound;
+ break;
default:
// random repetition
assert sumDocFreq - 1 + lowerBound <= upperBound;
@@ -251,29 +288,46 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
*/
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
final long docFreq;
- if (random.nextBoolean()) {
- // rare term
- docFreq = 1;
- } else {
- // random specificity
- docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
+ switch (random.nextInt(3)) {
+ case 0:
+ // rare term
+ docFreq = 1;
+ break;
+ case 1:
+ // common term
+ docFreq = corpus.docCount();
+ break;
+ default:
+ // random specificity
+ docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
+ break;
}
final long totalTermFreq;
+ // can't require docs to have > 2B tokens
+ long upperBound;
+ try {
+ upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
+ } catch (ArithmeticException overflow) {
+ upperBound = corpus.sumTotalTermFreq();
+ }
if (corpus.sumTotalTermFreq() == -1) {
// omitTF
totalTermFreq = -1;
- } else if (random.nextBoolean()) {
- // no repetition
- totalTermFreq = docFreq;
} else {
- // random repetition: but can't require docs to have > 2B tokens
- long upperBound;
- try {
- upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
- } catch (ArithmeticException overflow) {
- upperBound = corpus.sumTotalTermFreq();
+ switch (random.nextInt(3)) {
+ case 0:
+ // no repetition
+ totalTermFreq = docFreq;
+ break;
+ case 1:
+ // maximum repetition
+ totalTermFreq = upperBound;
+ break;
+ default:
+ // random repetition
+ totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
+ break;
}
- totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
}
return new TermStatistics(TERM, docFreq, totalTermFreq);
}
@@ -317,9 +371,34 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
// there is at least one other document, and those must have at least 1 instance each.
int upperBound = Math.toIntExact(Math.min(term.totalTermFreq() - term.docFreq() + 1, Integer.MAX_VALUE));
if (random.nextBoolean()) {
- freq = TestUtil.nextInt(random, 1, upperBound);
+ // integer freq
+ switch (random.nextInt(3)) {
+ case 0:
+ // smallest freq
+ freq = 1;
+ break;
+ case 1:
+ // largest freq
+ freq = upperBound;
+ break;
+ default:
+ // random freq
+ freq = TestUtil.nextInt(random, 1, upperBound);
+ break;
+ }
} else {
- float freqCandidate = upperBound * random.nextFloat();
+ // float freq
+ float freqCandidate;
+ switch (random.nextInt(2)) {
+ case 0:
+ // smallest freq
+ freqCandidate = Float.MIN_VALUE;
+ break;
+ default:
+ // random freq
+ freqCandidate = upperBound * random.nextFloat();
+ break;
+ }
// we need to be 2nd float value at a minimum, the pairwise test will check MIN_VALUE in this case.
// this avoids testing frequencies of 0 which seem wrong to allow (we should enforce computeSlopFactor etc)
if (freqCandidate <= Float.MIN_VALUE) {