blob: cac56e98561bfd82982083b7fdade34e75b67301 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Collections;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
/** Tests BooleanQuery: verifies that different (randomized) scorer implementations
 * produce identical hits and scores, and checks the scoring order of BooleanQuery
 * against hand-computed expected document numbers.
 */
public class TestBoolean2 extends LuceneTestCase {
private static IndexSearcher searcher;
private static IndexSearcher singleSegmentSearcher;
private static IndexSearcher bigSearcher;
private static IndexReader reader;
private static IndexReader littleReader;
private static IndexReader singleSegmentReader;
/** num of empty docs injected between every doc in the (main) index */
private static int NUM_FILLER_DOCS;
/** num of empty docs injected prior to the first doc in the (main) index */
private static int PRE_FILLER_DOCS;
/** num "extra" docs containing value in "field2" added to the "big" clone of the index */
private static final int NUM_EXTRA_DOCS = 6000;
public static final String field = "field";
private static Directory directory;
private static Directory singleSegmentDirectory;
private static Directory dir2;
private static int mulFactor;
private static Directory copyOf(Directory dir) throws IOException {
Directory copy = newFSDirectory(createTempDir());
for(String name : dir.listAll()) {
if (name.startsWith("extra")) {
continue;
}
copy.copyFrom(dir, name, name, IOContext.DEFAULT);
copy.sync(Collections.singleton(name));
}
return copy;
}
@BeforeClass
public static void beforeClass() throws Exception {
// in some runs, test immediate adjacency of matches - in others, force a full bucket gap between docs
NUM_FILLER_DOCS = random().nextBoolean() ? 0 : BooleanScorer.SIZE;
PRE_FILLER_DOCS = TestUtil.nextInt(random(), 0, (NUM_FILLER_DOCS / 2));
if (VERBOSE) {
System.out.println("TEST: NUM_FILLER_DOCS=" + NUM_FILLER_DOCS + " PRE_FILLER_DOCS=" + PRE_FILLER_DOCS);
}
if (NUM_FILLER_DOCS * PRE_FILLER_DOCS > 100000) {
directory = newFSDirectory(createTempDir());
} else {
directory = newDirectory();
}
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
// randomized codecs are sometimes too costly for this test:
iwc.setCodec(TestUtil.getDefaultCodec());
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer= new RandomIndexWriter(random(), directory, iwc);
// we'll make a ton of docs, disable store/norms/vectors
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setOmitNorms(true);
Document doc = new Document();
for (int filler = 0; filler < PRE_FILLER_DOCS; filler++) {
writer.addDocument(doc);
}
for (int i = 0; i < docFields.length; i++) {
doc.add(new Field(field, docFields[i], ft));
writer.addDocument(doc);
doc = new Document();
for (int filler = 0; filler < NUM_FILLER_DOCS; filler++) {
writer.addDocument(doc);
}
}
writer.close();
littleReader = DirectoryReader.open(directory);
searcher = newSearcher(littleReader);
// this is intentionally using the baseline sim, because it compares against bigSearcher (which uses a random one)
searcher.setSimilarity(new ClassicSimilarity());
// make a copy of our index using a single segment
if (NUM_FILLER_DOCS * PRE_FILLER_DOCS > 100000) {
singleSegmentDirectory = newFSDirectory(createTempDir());
} else {
singleSegmentDirectory = newDirectory();
}
// TODO: this test does not need to be doing this crazy stuff. please improve it!
for (String fileName : directory.listAll()) {
if (fileName.startsWith("extra")) {
continue;
}
singleSegmentDirectory.copyFrom(directory, fileName, fileName, IOContext.DEFAULT);
singleSegmentDirectory.sync(Collections.singleton(fileName));
}
iwc = newIndexWriterConfig(new MockAnalyzer(random()));
// we need docID order to be preserved:
// randomized codecs are sometimes too costly for this test:
iwc.setCodec(TestUtil.getDefaultCodec());
iwc.setMergePolicy(newLogMergePolicy());
try (IndexWriter w = new IndexWriter(singleSegmentDirectory, iwc)) {
w.forceMerge(1, true);
}
singleSegmentReader = DirectoryReader.open(singleSegmentDirectory);
singleSegmentSearcher = newSearcher(singleSegmentReader);
singleSegmentSearcher.setSimilarity(searcher.getSimilarity());
// Make big index
dir2 = copyOf(directory);
// First multiply small test index:
mulFactor = 1;
int docCount = 0;
if (VERBOSE) {
System.out.println("\nTEST: now copy index...");
}
do {
if (VERBOSE) {
System.out.println("\nTEST: cycle...");
}
final Directory copy = copyOf(dir2);
iwc = newIndexWriterConfig(new MockAnalyzer(random()));
// randomized codecs are sometimes too costly for this test:
iwc.setCodec(TestUtil.getDefaultCodec());
RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc);
w.addIndexes(copy);
copy.close();
docCount = w.getDocStats().maxDoc;
w.close();
mulFactor *= 2;
} while(docCount < 3000 * NUM_FILLER_DOCS);
iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000));
// randomized codecs are sometimes too costly for this test:
iwc.setCodec(TestUtil.getDefaultCodec());
RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc);
doc = new Document();
doc.add(new Field("field2", "xxx", ft));
for(int i=0;i<NUM_EXTRA_DOCS/2;i++) {
w.addDocument(doc);
}
doc = new Document();
doc.add(new Field("field2", "big bad bug", ft));
for(int i=0;i<NUM_EXTRA_DOCS/2;i++) {
w.addDocument(doc);
}
reader = w.getReader();
bigSearcher = newSearcher(reader);
w.close();
}
@AfterClass
public static void afterClass() throws Exception {
reader.close();
littleReader.close();
singleSegmentReader.close();
dir2.close();
directory.close();
singleSegmentDirectory.close();
singleSegmentSearcher = null;
singleSegmentReader = null;
singleSegmentDirectory = null;
searcher = null;
reader = null;
littleReader = null;
dir2 = null;
directory = null;
bigSearcher = null;
}
private static String[] docFields = {
"w1 w2 w3 w4 w5",
"w1 w3 w2 w3",
"w1 xx w2 yy w3",
"w1 w3 xx w2 yy mm"
};
public void queriesTest(Query query, int[] expDocNrs) throws Exception {
// adjust the expected doc numbers according to our filler docs
if (0 < NUM_FILLER_DOCS) {
expDocNrs = ArrayUtil.copyOfSubArray(expDocNrs, 0, expDocNrs.length);
for (int i=0; i < expDocNrs.length; i++) {
expDocNrs[i] = PRE_FILLER_DOCS + ((NUM_FILLER_DOCS + 1) * expDocNrs[i]);
}
}
final int topDocsToCheck = atLeast(1000);
// The asserting searcher will sometimes return the bulk scorer and
// sometimes return a default impl around the scorer so that we can
// compare BS1 and BS2
TopScoreDocCollector collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
searcher.search(query, collector);
ScoreDoc[] hits1 = collector.topDocs().scoreDocs;
collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
searcher.search(query, collector);
ScoreDoc[] hits2 = collector.topDocs().scoreDocs;
CheckHits.checkHitsQuery(query, hits1, hits2, expDocNrs);
// Since we have no deleted docs, we should also be able to verify identical matches &
// scores against an single segment copy of our index
collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
singleSegmentSearcher.search(query, collector);
hits2 = collector.topDocs().scoreDocs;
CheckHits.checkHitsQuery(query, hits1, hits2, expDocNrs);
// sanity check expected num matches in bigSearcher
assertEquals(mulFactor * collector.totalHits,
bigSearcher.count(query));
// now check 2 diff scorers from the bigSearcher as well
collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
bigSearcher.search(query, collector);
hits1 = collector.topDocs().scoreDocs;
collector = TopScoreDocCollector.create(topDocsToCheck, Integer.MAX_VALUE);
bigSearcher.search(query, collector);
hits2 = collector.topDocs().scoreDocs;
// NOTE: just comparing results, not vetting against expDocNrs
// since we have dups in bigSearcher
CheckHits.checkEqual(query, hits1, hits2);
}
@Test
public void testQueries01() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST);
int[] expDocNrs = {2,3};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testQueries02() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.SHOULD);
int[] expDocNrs = {2,3,1,0};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testQueries03() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.SHOULD);
int[] expDocNrs = {2,3,1,0};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testQueries04() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST_NOT);
int[] expDocNrs = {1,0};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testQueries05() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST_NOT);
int[] expDocNrs = {1,0};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testQueries06() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST_NOT);
query.add(new TermQuery(new Term(field, "w5")), BooleanClause.Occur.MUST_NOT);
int[] expDocNrs = {1};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testQueries07() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST_NOT);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST_NOT);
query.add(new TermQuery(new Term(field, "w5")), BooleanClause.Occur.MUST_NOT);
int[] expDocNrs = {};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testQueries08() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.SHOULD);
query.add(new TermQuery(new Term(field, "w5")), BooleanClause.Occur.MUST_NOT);
int[] expDocNrs = {2,3,1};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testQueries09() throws Exception {
BooleanQuery.Builder query = new BooleanQuery.Builder();
query.add(new TermQuery(new Term(field, "w3")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term(field, "xx")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term(field, "w2")), BooleanClause.Occur.MUST);
query.add(new TermQuery(new Term(field, "zz")), BooleanClause.Occur.SHOULD);
int[] expDocNrs = {2, 3};
queriesTest(query.build(), expDocNrs);
}
@Test
public void testRandomQueries() throws Exception {
String[] vals = {"w1","w2","w3","w4","w5","xx","yy","zzz"};
int tot=0;
BooleanQuery q1 = null;
try {
// increase number of iterations for more complete testing
int num = atLeast(3);
for (int i=0; i<num; i++) {
int level = random().nextInt(3);
q1 = randBoolQuery(new Random(random().nextLong()), random().nextBoolean(), level, field, vals, null).build();
// Can't sort by relevance since floating point numbers may not quite
// match up.
Sort sort = Sort.INDEXORDER;
QueryUtils.check(random(), q1,searcher); // baseline sim
try {
// a little hackish, QueryUtils.check is too costly to do on bigSearcher in this loop.
searcher.setSimilarity(bigSearcher.getSimilarity()); // random sim
QueryUtils.check(random(), q1, searcher);
} finally {
searcher.setSimilarity(new ClassicSimilarity()); // restore
}
// check diff (randomized) scorers (from AssertingSearcher) produce the same results
TopFieldCollector collector = TopFieldCollector.create(sort, 1000, 1);
searcher.search(q1, collector);
ScoreDoc[] hits1 = collector.topDocs().scoreDocs;
collector = TopFieldCollector.create(sort, 1000, 1);
searcher.search(q1, collector);
ScoreDoc[] hits2 = collector.topDocs().scoreDocs;
tot+=hits2.length;
CheckHits.checkEqual(q1, hits1, hits2);
BooleanQuery.Builder q3 = new BooleanQuery.Builder();
q3.add(q1, BooleanClause.Occur.SHOULD);
q3.add(new PrefixQuery(new Term("field2", "b")), BooleanClause.Occur.SHOULD);
assertEquals(mulFactor*collector.totalHits + NUM_EXTRA_DOCS/2, bigSearcher.count(q3.build()));
// test diff (randomized) scorers produce the same results on bigSearcher as well
collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1);
bigSearcher.search(q1, collector);
hits1 = collector.topDocs().scoreDocs;
collector = TopFieldCollector.create(sort, 1000 * mulFactor, 1);
bigSearcher.search(q1, collector);
hits2 = collector.topDocs().scoreDocs;
CheckHits.checkEqual(q1, hits1, hits2);
}
} catch (Exception e) {
// For easier debugging
System.out.println("failed query: " + q1);
throw e;
}
// System.out.println("Total hits:"+tot);
}
// used to set properties or change every BooleanQuery
// generated from randBoolQuery.
public static interface Callback {
public void postCreate(BooleanQuery.Builder q);
}
// Random rnd is passed in so that the exact same random query may be created
// more than once.
public static BooleanQuery.Builder randBoolQuery(Random rnd, boolean allowMust, int level, String field, String[] vals, Callback cb) {
BooleanQuery.Builder current = new BooleanQuery.Builder();
for (int i=0; i<rnd.nextInt(vals.length)+1; i++) {
int qType=0; // term query
if (level>0) {
qType = rnd.nextInt(10);
}
Query q;
if (qType < 3) {
q = new TermQuery(new Term(field, vals[rnd.nextInt(vals.length)]));
} else if (qType < 4) {
String t1 = vals[rnd.nextInt(vals.length)];
String t2 = vals[rnd.nextInt(vals.length)];
q = new PhraseQuery(10, field, t1, t2); // slop increases possibility of matching
} else if (qType < 7) {
q = new WildcardQuery(new Term(field, "w*"));
} else {
q = randBoolQuery(rnd, allowMust, level-1, field, vals, cb).build();
}
int r = rnd.nextInt(10);
BooleanClause.Occur occur;
if (r<2) {
occur=BooleanClause.Occur.MUST_NOT;
}
else if (r<5) {
if (allowMust) {
occur=BooleanClause.Occur.MUST;
} else {
occur=BooleanClause.Occur.SHOULD;
}
} else {
occur=BooleanClause.Occur.SHOULD;
}
current.add(q, occur);
}
if (cb!=null) cb.postCreate(current);
return current;
}
}