| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| import java.util.BitSet; |
| import java.util.Random; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.automaton.Automata; |
| import org.apache.lucene.util.automaton.CharacterRunAutomaton; |
| import org.junit.AfterClass; |
| import org.junit.BeforeClass; |
| |
| /** |
| * Simple base class for checking search equivalence. |
| * Extend it, and write tests that create {@link #randomTerm()}s |
| * (all terms are single characters a-z), and use |
| * {@link #assertSameSet(Query, Query)} and |
| * {@link #assertSubsetOf(Query, Query)} |
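* <p>
* A hypothetical subclass might look like the sketch below (the class
* and test names are for illustration only):
* <pre class="prettyprint">
* public class TestSingleClauseEquivalence extends SearchEquivalenceTestBase {
*   public void testTermVersusSingleClauseBoolean() throws Exception {
*     Term t = randomTerm();
*     Query q1 = new TermQuery(t);
*     // a lone SHOULD clause must match exactly the same documents
*     Query q2 = new BooleanQuery.Builder()
*         .add(new TermQuery(t), Occur.SHOULD)
*         .build();
*     assertSameSet(q1, q2);
*   }
* }
* </pre>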
| */ |
| @SuppressCodecs("SimpleText") |
| public abstract class SearchEquivalenceTestBase extends LuceneTestCase { |
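// two searchers over the same index: q1 runs against s1 and q2 against s2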
| protected static IndexSearcher s1, s2; |
| protected static Directory directory; |
| protected static IndexReader reader; |
| protected static Analyzer analyzer; |
protected static String stopword; // we always pick a single character as the stopword
| |
| @BeforeClass |
| public static void beforeClass() throws Exception { |
| Random random = random(); |
| directory = newDirectory(); |
| stopword = "" + randomChar(); |
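// whitespace-tokenizing analyzer, no lowercasing, with the single-char stopword removed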
| CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString(stopword)); |
| analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset); |
| RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer); |
| Document doc = new Document(); |
| Field id = new StringField("id", "", Field.Store.NO); |
| Field field = new TextField("field", "", Field.Store.NO); |
| doc.add(id); |
| doc.add(field); |
| |
| // index some docs |
| int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100); |
| for (int i = 0; i < numDocs; i++) { |
| id.setStringValue(Integer.toString(i)); |
| field.setStringValue(randomFieldContents()); |
| iw.addDocument(doc); |
| } |
| |
| // delete some docs |
| int numDeletes = numDocs/20; |
| for (int i = 0; i < numDeletes; i++) { |
| Term toDelete = new Term("id", Integer.toString(random.nextInt(numDocs))); |
| if (random.nextBoolean()) { |
| iw.deleteDocuments(toDelete); |
| } else { |
| iw.deleteDocuments(new TermQuery(toDelete)); |
| } |
| } |
| |
| reader = iw.getReader(); |
| s1 = newSearcher(reader); |
| s2 = newSearcher(reader); |
| iw.close(); |
| } |
| |
| @AfterClass |
| public static void afterClass() throws Exception { |
| reader.close(); |
| directory.close(); |
| analyzer.close(); |
| reader = null; |
| directory = null; |
| analyzer = null; |
| s1 = s2 = null; |
| } |
| |
| /** |
| * populate a field with random contents. |
| * terms should be single characters in lowercase (a-z) |
| * tokenization can be assumed to be on whitespace. |
| */ |
| static String randomFieldContents() { |
| // TODO: zipf-like distribution |
| StringBuilder sb = new StringBuilder(); |
| int numTerms = random().nextInt(15); |
| for (int i = 0; i < numTerms; i++) { |
| if (sb.length() > 0) { |
| sb.append(' '); // whitespace |
| } |
| sb.append(randomChar()); |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * returns random character (a-z) |
| */ |
| static char randomChar() { |
| return (char) TestUtil.nextInt(random(), 'a', 'z'); |
| } |
| |
| /** |
| * returns a term suitable for searching. |
| * terms are single characters in lowercase (a-z) |
| */ |
| protected Term randomTerm() { |
| return new Term("field", "" + randomChar()); |
| } |
| |
| /** |
| * Returns a random filter over the document set |
| */ |
| protected Query randomFilter() { |
| final Query query; |
| if (random().nextBoolean()) { |
| query = TermRangeQuery.newStringRange("field", "a", "" + randomChar(), true, true); |
| } else { |
| // use a query with a two-phase approximation |
| PhraseQuery phrase = new PhraseQuery(100, "field", "" + randomChar(), "" + randomChar()); |
| query = phrase; |
| } |
| return query; |
| } |
| |
| /** |
| * Asserts that the documents returned by <code>q1</code> |
* are the same as those returned by <code>q2</code>.
| */ |
| public void assertSameSet(Query q1, Query q2) throws Exception { |
| assertSubsetOf(q1, q2); |
| assertSubsetOf(q2, q1); |
| } |
| |
| /** |
| * Asserts that the documents returned by <code>q1</code> |
* are a subset of those returned by <code>q2</code>.
| */ |
| public void assertSubsetOf(Query q1, Query q2) throws Exception { |
| // test without a filter |
| assertSubsetOf(q1, q2, null); |
| |
// test with some filters (these will sometimes force the scorers to advance(), exercising that code path)
| int numFilters = TEST_NIGHTLY ? atLeast(10) : atLeast(3); |
| for (int i = 0; i < numFilters; i++) { |
| Query filter = randomFilter(); |
| // incorporate the filter in different ways. |
| assertSubsetOf(q1, q2, filter); |
| assertSubsetOf(filteredQuery(q1, filter), filteredQuery(q2, filter), null); |
| } |
| } |
| |
| /** |
| * Asserts that the documents returned by <code>q1</code> |
| * are a subset of those returned by <code>q2</code>. |
| * |
* Both queries will be filtered by <code>filter</code>.
| */ |
| protected void assertSubsetOf(Query q1, Query q2, Query filter) throws Exception { |
| QueryUtils.check(q1); |
| QueryUtils.check(q2); |
| |
| if (filter != null) { |
| q1 = new BooleanQuery.Builder() |
| .add(q1, Occur.MUST) |
| .add(filter, Occur.FILTER) |
| .build(); |
| q2 = new BooleanQuery.Builder() |
| .add(q2, Occur.MUST) |
| .add(filter, Occur.FILTER) |
| .build(); |
| } |
| // we test both INDEXORDER and RELEVANCE because we want to test needsScores=true/false |
| for (Sort sort : new Sort[] { Sort.INDEXORDER, Sort.RELEVANCE }) { |
| // not efficient, but simple! |
| TopDocs td1 = s1.search(q1, reader.maxDoc(), sort); |
| TopDocs td2 = s2.search(q2, reader.maxDoc(), sort); |
| assertTrue("too many hits: " + td1.totalHits.value + " > " + td2.totalHits.value, td1.totalHits.value <= td2.totalHits.value); |
| |
| // fill the superset into a bitset |
| BitSet bitset = new BitSet(); |
| for (int i = 0; i < td2.scoreDocs.length; i++) { |
| bitset.set(td2.scoreDocs[i].doc); |
| } |
| |
// check that every document in the subset was also returned by the superset
| for (int i = 0; i < td1.scoreDocs.length; i++) { |
| assertTrue(bitset.get(td1.scoreDocs[i].doc)); |
| } |
| } |
| } |
| |
| /** |
| * Assert that two queries return the same documents and with the same scores. |
| */ |
| protected void assertSameScores(Query q1, Query q2) throws Exception { |
| assertSameSet(q1, q2); |
| |
| assertSameScores(q1, q2, null); |
// also test with some filters, to exercise advancing
| int numFilters = TEST_NIGHTLY ? atLeast(10) : atLeast(3); |
| for (int i = 0; i < numFilters; i++) { |
| Query filter = randomFilter(); |
| // incorporate the filter in different ways. |
| assertSameScores(q1, q2, filter); |
| assertSameScores(filteredQuery(q1, filter), filteredQuery(q2, filter), null); |
| } |
| } |
| |
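/**
* Asserts that <code>q1</code> and <code>q2</code> return documents
* with identical scores, with both queries restricted by
* <code>filter</code> (which may be null).
*/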
| protected void assertSameScores(Query q1, Query q2, Query filter) throws Exception { |
| // not efficient, but simple! |
| if (filter != null) { |
| q1 = new BooleanQuery.Builder() |
| .add(q1, Occur.MUST) |
| .add(filter, Occur.FILTER) |
| .build(); |
| q2 = new BooleanQuery.Builder() |
| .add(q2, Occur.MUST) |
| .add(filter, Occur.FILTER) |
| .build(); |
| } |
| TopDocs td1 = s1.search(q1, reader.maxDoc()); |
| TopDocs td2 = s2.search(q2, reader.maxDoc()); |
| assertEquals(td1.totalHits.value, td2.totalHits.value); |
| for (int i = 0; i < td1.scoreDocs.length; ++i) { |
| assertEquals(td1.scoreDocs[i].doc, td2.scoreDocs[i].doc); |
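// scores must match within a small floating-point tolerance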
| assertEquals(td1.scoreDocs[i].score, td2.scoreDocs[i].score, 10e-5); |
| } |
| } |
| |
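/**
* Returns <code>query</code> restricted to documents that also match
* <code>filter</code>, without letting the filter affect the score.
*/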
| protected Query filteredQuery(Query query, Query filter) { |
| return new BooleanQuery.Builder() |
| .add(query, Occur.MUST) |
| .add(filter, Occur.FILTER) |
| .build(); |
| } |
| } |