/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/**
* Tests {@link PhraseQuery}.
*
* @see TestPositionIncrement
*/
public class TestPhraseQuery extends LuceneTestCase {
/** threshold for comparing floats */
public static final float SCORE_COMP_THRESH = 1e-6f;
private static IndexSearcher searcher;
private static IndexReader reader;
private PhraseQuery query;
private static Directory directory;
@BeforeClass
public static void beforeClass() throws Exception {
directory = newDirectory();
Analyzer analyzer = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
}
@Override
public int getPositionIncrementGap(String fieldName) {
return 100;
}
};
RandomIndexWriter writer = new RandomIndexWriter(random(), directory, analyzer);
Document doc = new Document();
doc.add(newTextField("field", "one two three four five", Field.Store.YES));
doc.add(newTextField("repeated", "this is a repeated field - first part", Field.Store.YES));
Field repeatedField = newTextField("repeated", "second part of a repeated field", Field.Store.YES);
doc.add(repeatedField);
doc.add(newTextField("palindrome", "one two three two one", Field.Store.YES));
writer.addDocument(doc);
doc = new Document();
doc.add(newTextField("nonexist", "phrase exist notexist exist found", Field.Store.YES));
writer.addDocument(doc);
doc = new Document();
doc.add(newTextField("nonexist", "phrase exist notexist exist found", Field.Store.YES));
writer.addDocument(doc);
reader = writer.getReader();
writer.close();
searcher = new IndexSearcher(reader);
}
@Override
public void setUp() throws Exception {
super.setUp();
}
@AfterClass
public static void afterClass() throws Exception {
searcher = null;
reader.close();
reader = null;
directory.close();
directory = null;
}
public void testNotCloseEnough() throws Exception {
query = new PhraseQuery(2, "field", "one", "five");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals(0, hits.length);
QueryUtils.check(random(), query,searcher);
}
public void testBarelyCloseEnough() throws Exception {
query = new PhraseQuery(3, "field", "one", "five");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
QueryUtils.check(random(), query, searcher);
}
/**
* Ensures slop of 0 works for exact matches, but not for reversed terms
*/
public void testExact() throws Exception {
// slop is zero by default
query = new PhraseQuery("field", "four", "five");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals("exact match", 1, hits.length);
QueryUtils.check(random(), query,searcher);
query = new PhraseQuery("field", "two", "one");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("reverse not exact", 0, hits.length);
QueryUtils.check(random(), query,searcher);
}
public void testSlop1() throws Exception {
// Ensures slop of 1 works with terms in order.
query = new PhraseQuery(1, "field", "one", "two");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals("in order", 1, hits.length);
QueryUtils.check(random(), query,searcher);
// Ensures slop of 1 does not work for phrases out of order;
// must be at least 2.
query = new PhraseQuery(1, "field", "two", "one");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("reversed, slop not 2 or more", 0, hits.length);
QueryUtils.check(random(), query,searcher);
}
/**
* As long as slop is at least 2, terms can be reversed
*/
public void testOrderDoesntMatter() throws Exception {
// must be at least two for reverse order match
query = new PhraseQuery(2, "field", "two", "one");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals("just sloppy enough", 1, hits.length);
QueryUtils.check(random(), query,searcher);
query = new PhraseQuery(2, "field", "three", "one");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("not sloppy enough", 0, hits.length);
QueryUtils.check(random(), query,searcher);
}
/**
* slop is the total number of positional moves allowed
* to line up a phrase
*/
public void testMultipleTerms() throws Exception {
query = new PhraseQuery(2, "field", "one", "three", "five");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals("two total moves", 1, hits.length);
QueryUtils.check(random(), query,searcher);
// it takes six moves to match this phrase
query = new PhraseQuery(5, "field", "five", "three", "one");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("slop of 5 not close enough", 0, hits.length);
QueryUtils.check(random(), query,searcher);
query = new PhraseQuery(6, "field", "five", "three", "one");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("slop of 6 just right", 1, hits.length);
QueryUtils.check(random(), query,searcher);
}
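/**
* A minimal sketch mirroring testMultipleTerms: the slop constructor used above is
* shorthand for {@link PhraseQuery.Builder} plus {@link PhraseQuery.Builder#setSlop(int)}.
* Lining up "five three one" against the indexed "one two three four five" takes six
* positional moves, so a budget of 6 is just enough. Only the "field" fixture from
* beforeClass() is assumed; the method name and assertions are illustrative.
*/
public void testSlopBuilderEquivalenceSketch() throws Exception {
PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("field", "five"));
builder.add(new Term("field", "three"));
builder.add(new Term("field", "one"));
builder.setSlop(6);
PhraseQuery viaBuilder = builder.build();
// same field, terms, implicit positions (0, 1, 2) and slop as the shorthand form
assertEquals(new PhraseQuery(6, "field", "five", "three", "one"), viaBuilder);
// six moves is just enough, matching the "slop of 6 just right" assertion above
assertEquals(1, searcher.count(viaBuilder));
}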
public void testPhraseQueryWithStopAnalyzer() throws Exception {
Directory directory = newDirectory();
Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
newIndexWriterConfig(stopAnalyzer));
Document doc = new Document();
doc.add(newTextField("field", "the stop words are here", Field.Store.YES));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(reader);
// valid exact phrase query
PhraseQuery query = new PhraseQuery("field", "stop", "words");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals(1, hits.length);
QueryUtils.check(random(), query,searcher);
reader.close();
directory.close();
}
public void testPhraseQueryInConjunctionScorer() throws Exception {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
Document doc = new Document();
doc.add(newTextField("source", "marketing info", Field.Store.YES));
writer.addDocument(doc);
doc = new Document();
doc.add(newTextField("contents", "foobar", Field.Store.YES));
doc.add(newTextField("source", "marketing info", Field.Store.YES));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(reader);
PhraseQuery phraseQuery = new PhraseQuery("source", "marketing", "info");
ScoreDoc[] hits = searcher.search(phraseQuery, 1000).scoreDocs;
assertEquals(2, hits.length);
QueryUtils.check(random(), phraseQuery,searcher);
TermQuery termQuery = new TermQuery(new Term("contents","foobar"));
BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST);
hits = searcher.search(booleanQuery.build(), 1000).scoreDocs;
assertEquals(1, hits.length);
QueryUtils.check(random(), termQuery,searcher);
reader.close();
writer = new RandomIndexWriter(random(), directory,
newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.CREATE));
doc = new Document();
doc.add(newTextField("contents", "map entry woo", Field.Store.YES));
writer.addDocument(doc);
doc = new Document();
doc.add(newTextField("contents", "woo map entry", Field.Store.YES));
writer.addDocument(doc);
doc = new Document();
doc.add(newTextField("contents", "map foobarword entry woo", Field.Store.YES));
writer.addDocument(doc);
reader = writer.getReader();
writer.close();
searcher = newSearcher(reader);
termQuery = new TermQuery(new Term("contents","woo"));
phraseQuery = new PhraseQuery("contents", "map", "entry");
hits = searcher.search(termQuery, 1000).scoreDocs;
assertEquals(3, hits.length);
hits = searcher.search(phraseQuery, 1000).scoreDocs;
assertEquals(2, hits.length);
booleanQuery = new BooleanQuery.Builder();
booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST);
hits = searcher.search(booleanQuery.build(), 1000).scoreDocs;
assertEquals(2, hits.length);
booleanQuery = new BooleanQuery.Builder();
booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST);
booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
hits = searcher.search(booleanQuery.build(), 1000).scoreDocs;
assertEquals(2, hits.length);
QueryUtils.check(random(), booleanQuery.build(),searcher);
reader.close();
directory.close();
}
public void testSlopScoring() throws IOException {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
newIndexWriterConfig(new MockAnalyzer(random()))
.setMergePolicy(newLogMergePolicy())
.setSimilarity(new BM25Similarity()));
Document doc = new Document();
doc.add(newTextField("field", "foo firstname lastname foo", Field.Store.YES));
writer.addDocument(doc);
Document doc2 = new Document();
doc2.add(newTextField("field", "foo firstname zzz lastname foo", Field.Store.YES));
writer.addDocument(doc2);
Document doc3 = new Document();
doc3.add(newTextField("field", "foo firstname zzz yyy lastname foo", Field.Store.YES));
writer.addDocument(doc3);
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarity(new ClassicSimilarity());
PhraseQuery query = new PhraseQuery(Integer.MAX_VALUE, "field", "firstname", "lastname");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals(3, hits.length);
// Make sure that those matches where the terms appear closer to
// each other get a higher score:
assertEquals(1.0, hits[0].score, 0.01);
assertEquals(0, hits[0].doc);
assertEquals(0.63, hits[1].score, 0.01);
assertEquals(1, hits[1].doc);
assertEquals(0.47, hits[2].score, 0.01);
assertEquals(2, hits[2].doc);
QueryUtils.check(random(), query,searcher);
reader.close();
directory.close();
}
public void testToString() throws Exception {
PhraseQuery q = new PhraseQuery("field", new String[0]);
assertEquals("\"\"", q.toString());
PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("field", "hi"), 1);
q = builder.build();
assertEquals("field:\"? hi\"", q.toString());
builder = new PhraseQuery.Builder();
builder.add(new Term("field", "hi"), 1);
builder.add(new Term("field", "test"), 5);
q = builder.build(); // corresponds to the text "this hi this is a test is" with stop words removed
assertEquals("field:\"? hi ? ? ? test\"", q.toString());
builder = new PhraseQuery.Builder();
builder.add(new Term("field", "hi"), 1);
builder.add(new Term("field", "hello"), 1);
builder.add(new Term("field", "test"), 5);
q = builder.build();
assertEquals("field:\"? hi|hello ? ? ? test\"", q.toString());
builder = new PhraseQuery.Builder();
builder.add(new Term("field", "hi"), 1);
builder.add(new Term("field", "hello"), 1);
builder.add(new Term("field", "test"), 5);
builder.setSlop(5);
q = builder.build();
assertEquals("field:\"? hi|hello ? ? ? test\"~5", q.toString());
}
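/**
* A minimal sketch of what the "?" gaps printed by toString() above mean at search
* time: unfilled positions must still be respected when matching. Only the "field"
* fixture ("one two three four five") from beforeClass() is assumed; the assertions
* are illustrative.
*/
public void testPositionGapSketch() throws Exception {
PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("field", "one"), 0);
builder.add(new Term("field", "three"), 2); // leave position 1 unfilled
PhraseQuery q = builder.build();
assertEquals("field:\"one ? three\"", q.toString());
// "one" and "three" are exactly two positions apart in the indexed text
assertEquals(1, searcher.count(q));
}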
public void testWrappedPhrase() throws IOException {
query = new PhraseQuery(100, "repeated", "first", "part", "second", "part");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals("slop of 100 just right", 1, hits.length);
QueryUtils.check(random(), query,searcher);
query = new PhraseQuery(99, "repeated", "first", "part", "second", "part");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("slop of 99 not enough", 0, hits.length);
QueryUtils.check(random(), query,searcher);
}
// these tests work on two docs like this: "phrase exist notexist exist found"
public void testNonExistingPhrase() throws IOException {
// phrase without repetitions that exists in 2 docs
query = new PhraseQuery(2, "nonexist", "phrase", "notexist", "found");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals("phrase without repetitions exists in 2 docs", 2, hits.length);
QueryUtils.check(random(), query,searcher);
// phrase with repetitions that exists in 2 docs
query = new PhraseQuery(1, "nonexist", "phrase", "exist", "exist");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("phrase with repetitions exists in two docs", 2, hits.length);
QueryUtils.check(random(), query,searcher);
// first phrase with repetitions that does not exist in any doc
query = new PhraseQuery(1000, "nonexist", "phrase", "notexist", "phrase");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length);
QueryUtils.check(random(), query,searcher);
// second phrase with repetitions that does not exist in any doc
query = new PhraseQuery(1000, "nonexist", "phrase", "exist", "exist", "exist");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("nonexisting phrase with repetitions does not exist in any doc", 0, hits.length);
QueryUtils.check(random(), query,searcher);
}
/**
* Works on two fields like this:
* Field("field", "one two three four five")
* Field("palindrome", "one two three two one")
* A phrase of size 2 occurs twice, once in order and once in reverse;
* because the doc is a palindrome, it is counted twice.
* In this case the order of the terms in the query does not matter.
* Also, when an exact match is found, the sloppy scorer and the exact scorer produce the same score.
*/
public void testPalyndrome2() throws Exception {
// search on the non-palindrome field, find phrase with no slop, using the exact phrase scorer
query = new PhraseQuery("field", "two", "three"); // to use exact phrase scorer
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals("phrase found with exact phrase scorer", 1, hits.length);
float score0 = hits[0].score;
//System.out.println("(exact) field: two three: "+score0);
QueryUtils.check(random(), query,searcher);
// search on the non-palindrome field, find phrase with slop 2, though no slop is required here.
query = new PhraseQuery(2, "field", "two", "three"); // to use sloppy scorer
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("just sloppy enough", 1, hits.length);
float score1 = hits[0].score;
//System.out.println("(sloppy) field: two three: "+score1);
assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH);
QueryUtils.check(random(), query,searcher);
// search in order in the palindrome field, find it twice
query = new PhraseQuery(2, "palindrome", "two", "three"); // must be at least two for both ordered and reversed to match
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("just sloppy enough", 1, hits.length);
//float score2 = hits[0].score;
//System.out.println("palindrome: two three: "+score2);
QueryUtils.check(random(), query,searcher);
//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
//assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
// search reversed in the palindrome field, find it twice
query = new PhraseQuery(2, "palindrome", "three", "two"); // must be at least two for both ordered and reversed to match
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("just sloppy enough", 1, hits.length);
//float score3 = hits[0].score;
//System.out.println("palindrome: three two: "+score3);
QueryUtils.check(random(), query,searcher);
//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
//assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
//assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
}
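/**
* A minimal companion sketch for the palindrome tests: in the "palindrome" fixture
* ("one two three two one") both term orders already occur, so even exact (slop 0)
* queries match in either direction, whereas in "field" ("one two three four five")
* only the original order does. Only the beforeClass() fixtures are assumed; the
* assertions are illustrative.
*/
public void testPalindromeExactOrdersSketch() throws Exception {
assertEquals(1, searcher.count(new PhraseQuery("palindrome", "two", "three")));
assertEquals(1, searcher.count(new PhraseQuery("palindrome", "three", "two")));
assertEquals(1, searcher.count(new PhraseQuery("field", "two", "three")));
assertEquals(0, searcher.count(new PhraseQuery("field", "three", "two")));
}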
/**
* Works on two fields like this:
* Field("field", "one two three four five")
* Field("palindrome", "one two three two one")
* A phrase of size 3 occurs twice, once in order and once in reverse;
* because the doc is a palindrome, it is counted twice.
* In this case the order of the terms in the query does not matter.
* Also, when an exact match is found, the sloppy scorer and the exact scorer produce the same score.
*/
public void testPalyndrome3() throws Exception {
// search on the non-palindrome field, find phrase with no slop, using the exact phrase scorer
// slop=0 to use exact phrase scorer
query = new PhraseQuery(0, "field", "one", "two", "three");
ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
assertEquals("phrase found with exact phrase scorer", 1, hits.length);
float score0 = hits[0].score;
//System.out.println("(exact) field: one two three: "+score0);
QueryUtils.check(random(), query,searcher);
// just make sure there is no exception:
searcher.explain(query, 0);
// search on the non-palindrome field, find phrase with slop 4, though no slop is required here.
// slop=4 to use sloppy scorer
query = new PhraseQuery(4, "field", "one", "two", "three");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("just sloppy enough", 1, hits.length);
float score1 = hits[0].score;
//System.out.println("(sloppy) field: one two three: "+score1);
assertEquals("exact scorer and sloppy scorer score the same when slop does not matter",score0, score1, SCORE_COMP_THRESH);
QueryUtils.check(random(), query,searcher);
// search in order in the palindrome field, find it twice
// slop must be at least four for both ordered and reversed to match
query = new PhraseQuery(4, "palindrome", "one", "two", "three");
hits = searcher.search(query, 1000).scoreDocs;
// just make sure there is no exception:
searcher.explain(query, 0);
assertEquals("just sloppy enough", 1, hits.length);
//float score2 = hits[0].score;
//System.out.println("palindrome: one two three: "+score2);
QueryUtils.check(random(), query,searcher);
//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
//assertTrue("ordered scores higher in palindrome",score1+SCORE_COMP_THRESH<score2);
// search reversed in the palindrome field, find it twice
// must be at least four for both ordered and reversed to match
query = new PhraseQuery(4, "palindrome", "three", "two", "one");
hits = searcher.search(query, 1000).scoreDocs;
assertEquals("just sloppy enough", 1, hits.length);
//float score3 = hits[0].score;
//System.out.println("palindrome: three two one: "+score3);
QueryUtils.check(random(), query,searcher);
//commented out for sloppy-phrase efficiency (issue 736) - see SloppyPhraseScorer.phraseFreq().
//assertTrue("reversed scores higher in palindrome",score1+SCORE_COMP_THRESH<score3);
//assertEquals("ordered or reversed does not matter",score2, score3, SCORE_COMP_THRESH);
}
// LUCENE-1280
public void testEmptyPhraseQuery() throws Throwable {
final BooleanQuery.Builder q2 = new BooleanQuery.Builder();
q2.add(new PhraseQuery("field", new String[0]), BooleanClause.Occur.MUST);
q2.build().toString();
}
/** Tests that a phrase with a single term is rewritten to a term query. */
public void testRewrite() throws IOException {
PhraseQuery pq = new PhraseQuery("foo", "bar");
Query rewritten = pq.rewrite(searcher.getIndexReader());
assertTrue(rewritten instanceof TermQuery);
}
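/**
* A minimal follow-up sketch to testRewrite: the single-term rewrite is interchangeable
* with the equivalent TermQuery, and an empty phrase rewrites to a query that cannot
* match anything. Only the "field" fixture from beforeClass() is assumed; the method
* and its assertions are illustrative.
*/
public void testRewriteSketch() throws IOException {
Query rewritten = new PhraseQuery("field", "five").rewrite(searcher.getIndexReader());
assertTrue(rewritten instanceof TermQuery);
assertEquals(searcher.count(new TermQuery(new Term("field", "five"))), searcher.count(rewritten));
// an empty phrase cannot match anything once rewritten
Query rewrittenEmpty = new PhraseQuery("field", new String[0]).rewrite(searcher.getIndexReader());
assertEquals(0, searcher.count(rewrittenEmpty));
}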
/** Tests PhraseQuery with terms at the same position in the query. */
public void testZeroPosIncr() throws IOException {
Directory dir = newDirectory();
final Token[] tokens = new Token[3];
tokens[0] = new Token();
tokens[0].append("a");
tokens[0].setPositionIncrement(1);
tokens[1] = new Token();
tokens[1].append("aa");
tokens[1].setPositionIncrement(0);
tokens[2] = new Token();
tokens[2].append("b");
tokens[2].setPositionIncrement(1);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(new TextField("field", new CannedTokenStream(tokens)));
writer.addDocument(doc);
IndexReader r = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(r);
// Sanity check; simple "a b" phrase:
PhraseQuery.Builder pqBuilder = new PhraseQuery.Builder();
pqBuilder.add(new Term("field", "a"), 0);
pqBuilder.add(new Term("field", "b"), 1);
assertEquals(1, searcher.count(pqBuilder.build()));
// Now with "a|aa b"
pqBuilder = new PhraseQuery.Builder();
pqBuilder.add(new Term("field", "a"), 0);
pqBuilder.add(new Term("field", "aa"), 0);
pqBuilder.add(new Term("field", "b"), 1);
assertEquals(1, searcher.count(pqBuilder.build()));
// Now with "a|z b" which should not match; this isn't a MultiPhraseQuery
pqBuilder = new PhraseQuery.Builder();
pqBuilder.add(new Term("field", "a"), 0);
pqBuilder.add(new Term("field", "z"), 0);
pqBuilder.add(new Term("field", "b"), 1);
assertEquals(0, searcher.count(pqBuilder.build()));
r.close();
dir.close();
}
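/**
* A minimal sketch for the "this isn't a MultiPhraseQuery" note in testZeroPosIncr:
* MultiPhraseQuery does treat several terms added at one position as alternatives, so
* the phrase matches as long as one alternative lines up. Only the "field" fixture
* ("one two three four five") from beforeClass() is assumed; the term "nomatch" is
* deliberately absent from the index and the assertion is illustrative.
*/
public void testMultiPhraseAlternativeSketch() throws IOException {
MultiPhraseQuery.Builder mpqBuilder = new MultiPhraseQuery.Builder();
// either "one" or "nomatch" at the first position, then "two" right after it
mpqBuilder.add(new Term[] { new Term("field", "one"), new Term("field", "nomatch") });
mpqBuilder.add(new Term("field", "two"));
assertEquals(1, searcher.count(mpqBuilder.build()));
}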
public void testRandomPhrases() throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));
List<List<String>> docs = new ArrayList<>();
Document d = new Document();
Field f = newTextField("f", "", Field.Store.NO);
d.add(f);
Random r = random();
int NUM_DOCS = atLeast(10);
for (int i = 0; i < NUM_DOCS; i++) {
// in nightly runs, must be > 4096 so it spans multiple chunks
int termCount = TEST_NIGHTLY ? atLeast(4097) : atLeast(200);
List<String> doc = new ArrayList<>();
StringBuilder sb = new StringBuilder();
while(doc.size() < termCount) {
if (r.nextInt(5) == 1 || docs.size() == 0) {
// make new non-empty-string term
String term;
while(true) {
term = TestUtil.randomUnicodeString(r);
if (term.length() > 0) {
break;
}
}
try (TokenStream ts = analyzer.tokenStream("ignore", term)) {
CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while(ts.incrementToken()) {
String text = termAttr.toString();
doc.add(text);
sb.append(text).append(' ');
}
ts.end();
}
} else {
// pick existing sub-phrase
List<String> lastDoc = docs.get(r.nextInt(docs.size()));
int len = TestUtil.nextInt(r, 1, 10);
int start = r.nextInt(lastDoc.size()-len);
for(int k=start;k<start+len;k++) {
String t = lastDoc.get(k);
doc.add(t);
sb.append(t).append(' ');
}
}
}
docs.add(doc);
f.setStringValue(sb.toString());
w.addDocument(d);
}
IndexReader reader = w.getReader();
IndexSearcher s = newSearcher(reader);
w.close();
// now search
int num = atLeast(3);
for(int i=0;i<num;i++) {
int docID = r.nextInt(docs.size());
List<String> doc = docs.get(docID);
final int numTerm = TestUtil.nextInt(r, 2, 20);
final int start = r.nextInt(doc.size()-numTerm);
PhraseQuery.Builder builder = new PhraseQuery.Builder();
StringBuilder sb = new StringBuilder();
for(int t=start;t<start+numTerm;t++) {
builder.add(new Term("f", doc.get(t)), t);
sb.append(doc.get(t)).append(' ');
}
PhraseQuery pq = builder.build();
TopDocs hits = s.search(pq, NUM_DOCS);
boolean found = false;
for(int j=0;j<hits.scoreDocs.length;j++) {
if (hits.scoreDocs[j].doc == docID) {
found = true;
break;
}
}
assertTrue("phrase '" + sb + "' not found; start=" + start + ", it=" + i + ", expected doc " + docID, found);
}
reader.close();
dir.close();
}
public void testNegativeSlop() throws Exception {
expectThrows(IllegalArgumentException.class, () -> {
new PhraseQuery(-2, "field", "two", "one");
});
}
public void testNegativePosition() throws Exception {
PhraseQuery.Builder builder = new PhraseQuery.Builder();
expectThrows(IllegalArgumentException.class, () -> {
builder.add(new Term("field", "two"), -42);
});
}
public void testBackwardPositions() throws Exception {
PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("field", "one"), 1);
builder.add(new Term("field", "two"), 5);
expectThrows(IllegalArgumentException.class, () -> {
builder.add(new Term("field", "three"), 4);
});
}
static String[] DOCS = new String[] {
"a b c d e f g h",
"b c b",
"c d d d e f g b",
"c b a b c",
"a a b b c c d d",
"a b c d a b c d a b c d"
};
public void testTopPhrases() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
String[] docs = ArrayUtil.copyOfSubArray(DOCS, 0, DOCS.length);
Collections.shuffle(Arrays.asList(docs), random());
for (String value : docs) {
Document doc = new Document();
doc.add(new TextField("f", value, Store.NO));
w.addDocument(doc);
}
IndexReader r = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = newSearcher(r);
for (Query query : Arrays.asList(
new PhraseQuery("f", "b", "c"), // common phrase
new PhraseQuery("f", "e", "f"), // always appear next to each other
new PhraseQuery("f", "d", "d") // repeated term
)) {
for (int topN = 1; topN <= 2; ++topN) {
TopScoreDocCollector collector1 = TopScoreDocCollector.create(topN, null, Integer.MAX_VALUE);
searcher.search(query, collector1);
ScoreDoc[] hits1 = collector1.topDocs().scoreDocs;
TopScoreDocCollector collector2 = TopScoreDocCollector.create(topN, null, 1);
searcher.search(query, collector2);
ScoreDoc[] hits2 = collector2.topDocs().scoreDocs;
assertTrue("" + query, hits1.length > 0);
CheckHits.checkEqual(query, hits1, hits2);
}
}
r.close();
dir.close();
}
public void testMergeImpacts() throws IOException {
DummyImpactsEnum impacts1 = new DummyImpactsEnum(1000);
DummyImpactsEnum impacts2 = new DummyImpactsEnum(2000);
ImpactsSource mergedImpacts = ExactPhraseMatcher.mergeImpacts(new ImpactsEnum[] { impacts1, impacts2 });
impacts1.reset(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
});
// Merge with empty impacts
impacts2.reset(
new Impact[0][],
new int[0]);
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Merge with dummy impacts
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(Integer.MAX_VALUE, 1) }
},
new int[] {
5000
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Merge with dummy impacts that we don't special case
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(Integer.MAX_VALUE, 2) }
},
new int[] {
5000
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// First level of impacts2 doesn't cover the first level of impacts1
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(6, 13) },
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
90,
1000
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(7, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Second level of impacts2 doesn't cover the first level of impacts1
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(6, 11) },
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
150,
900
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(3, 11), new Impact(5, 12), new Impact(6, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) } // same as impacts1
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(4, 10), new Impact(9, 13) },
new Impact[] { new Impact(1, 1), new Impact(4, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14), new Impact(13, 15) }
},
new int[] {
113,
950
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(4, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Make sure negative norms are treated as unsigned
impacts1.reset(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, -10), new Impact(8, -5) },
new Impact[] { new Impact(3, 10), new Impact(5, -15), new Impact(8, -5), new Impact(12, -3) }
},
new int[] {
110,
945
});
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(12, -4) },
new Impact[] { new Impact(3, 9), new Impact(12, -4), new Impact(20, -1) }
},
new int[] {
150,
960
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(8, -4) },
new Impact[] { new Impact(3, 10), new Impact(8, -4), new Impact(12, -3) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
}
private static void assertEquals(Impact[][] impacts, int[] docIdUpTo, Impacts actual) {
assertEquals(impacts.length, actual.numLevels());
for (int i = 0; i < impacts.length; ++i) {
assertEquals(docIdUpTo[i], actual.getDocIdUpTo(i));
assertEquals(Arrays.asList(impacts[i]), actual.getImpacts(i));
}
}
private static class DummyImpactsEnum extends ImpactsEnum {
private final long cost;
private Impact[][] impacts;
private int[] docIdUpTo;
DummyImpactsEnum(long cost) {
this.cost = cost;
}
void reset(Impact[][] impacts, int[] docIdUpTo) {
this.impacts = impacts;
this.docIdUpTo = docIdUpTo;
}
@Override
public void advanceShallow(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public Impacts getImpacts() throws IOException {
return new Impacts() {
@Override
public int numLevels() {
return impacts.length;
}
@Override
public int getDocIdUpTo(int level) {
return docIdUpTo[level];
}
@Override
public List<Impact> getImpacts(int level) {
return Arrays.asList(impacts[level]);
}
};
}
@Override
public int freq() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int nextPosition() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int startOffset() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int endOffset() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public BytesRef getPayload() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
throw new UnsupportedOperationException();
}
@Override
public int nextDoc() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int advance(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
return cost;
}
}
public void testRandomTopDocs() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
int numDocs = TEST_NIGHTLY ? atLeast(128 * 8 * 8 * 3) : atLeast(100); // in nightly runs, make sure some terms have skip data
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
int numTerms = random().nextInt(1 << random().nextInt(5));
String text = IntStream.range(0, numTerms)
.mapToObj(index -> random().nextBoolean() ? "a" : random().nextBoolean() ? "b" : "c")
.collect(Collectors.joining(" "));
doc.add(new TextField("foo", text, Store.NO));
w.addDocument(doc);
}
IndexReader reader = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = newSearcher(reader);
for (String firstTerm : new String[] {"a", "b", "c"}) {
for (String secondTerm : new String[] {"a", "b", "c"}) {
Query query = new PhraseQuery("foo", new BytesRef(firstTerm), new BytesRef(secondTerm));
TopScoreDocCollector collector1 = TopScoreDocCollector.create(10, null, Integer.MAX_VALUE); // COMPLETE
TopScoreDocCollector collector2 = TopScoreDocCollector.create(10, null, 10); // TOP_SCORES
searcher.search(query, collector1);
searcher.search(query, collector2);
CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
Query filteredQuery = new BooleanQuery.Builder()
.add(query, Occur.MUST)
.add(new TermQuery(new Term("foo", "b")), Occur.FILTER)
.build();
collector1 = TopScoreDocCollector.create(10, null, Integer.MAX_VALUE); // COMPLETE
collector2 = TopScoreDocCollector.create(10, null, 10); // TOP_SCORES
searcher.search(filteredQuery, collector1);
searcher.search(filteredQuery, collector2);
CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
}
}
reader.close();
dir.close();
}
public void testNullTerm() {
NullPointerException e = expectThrows(NullPointerException.class, () -> new PhraseQuery.Builder().add(null));
assertEquals("Cannot add a null term to PhraseQuery", e.getMessage());
e = expectThrows(NullPointerException.class, () -> new PhraseQuery("field", (BytesRef)null));
assertEquals("Cannot add a null term to PhraseQuery", e.getMessage());
e = expectThrows(NullPointerException.class, () -> new PhraseQuery("field", (String)null));
assertEquals("Cannot add a null term to PhraseQuery", e.getMessage());
}
}