blob: bd9ce08f93682aac861903e26e2d49b5823d9321 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.LinkedList;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Ignore;
/**
* This class tests the MultiPhraseQuery class.
*
*
*/
public class TestMultiPhraseQuery extends LuceneTestCase {
public void testPhrasePrefix() throws IOException {
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
add("blueberry pie", writer);
add("blueberry strudel", writer);
add("blueberry pizza", writer);
add("blueberry chewing gum", writer);
add("bluebird pizza", writer);
add("bluebird foobar pizza", writer);
add("piccadilly circus", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
// search for "blueberry pi*":
MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
// search for "strawberry pi*":
MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
query1builder.add(new Term("body", "blueberry"));
query2builder.add(new Term("body", "strawberry"));
LinkedList<Term> termsWithPrefix = new LinkedList<>();
// this TermEnum gives "piccadilly", "pie" and "pizza".
String prefix = "pi";
TermsEnum te = MultiTerms.getTerms(reader,"body").iterator();
te.seekCeil(new BytesRef(prefix));
do {
String s = te.term().utf8ToString();
if (s.startsWith(prefix)) {
termsWithPrefix.add(new Term("body", s));
} else {
break;
}
} while (te.next() != null);
query1builder.add(termsWithPrefix.toArray(new Term[0]));
MultiPhraseQuery query1 = query1builder.build();
assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
query2builder.add(termsWithPrefix.toArray(new Term[0]));
MultiPhraseQuery query2 = query2builder.build();
assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());
ScoreDoc[] result;
result = searcher.search(query1, 1000).scoreDocs;
assertEquals(2, result.length);
result = searcher.search(query2, 1000).scoreDocs;
assertEquals(0, result.length);
// search for "blue* pizza":
MultiPhraseQuery.Builder query3builder = new MultiPhraseQuery.Builder();
termsWithPrefix.clear();
prefix = "blue";
te.seekCeil(new BytesRef(prefix));
do {
if (te.term().utf8ToString().startsWith(prefix)) {
termsWithPrefix.add(new Term("body", te.term().utf8ToString()));
}
} while (te.next() != null);
query3builder.add(termsWithPrefix.toArray(new Term[0]));
query3builder.add(new Term("body", "pizza"));
MultiPhraseQuery query3 = query3builder.build();
result = searcher.search(query3, 1000).scoreDocs;
assertEquals(2, result.length); // blueberry pizza, bluebird pizza
assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
// test slop:
query3builder.setSlop(1);
query3 = query3builder.build();
result = searcher.search(query3, 1000).scoreDocs;
// just make sure no exc:
searcher.explain(query3, 0);
assertEquals(3, result.length); // blueberry pizza, bluebird pizza, bluebird
// foobar pizza
MultiPhraseQuery.Builder query4builder = new MultiPhraseQuery.Builder();
expectThrows(IllegalArgumentException.class, () -> {
query4builder.add(new Term("field1", "foo"));
query4builder.add(new Term("field2", "foobar"));
});
writer.close();
reader.close();
indexStore.close();
}
// LUCENE-2580
public void testTall() throws IOException {
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
add("blueberry chocolate pie", writer);
add("blueberry chocolate tart", writer);
IndexReader r = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(r);
MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
qb.add(new Term("body", "blueberry"));
qb.add(new Term("body", "chocolate"));
qb.add(new Term[] {new Term("body", "pie"), new Term("body", "tart")});
assertEquals(2, searcher.count(qb.build()));
r.close();
indexStore.close();
}
@Ignore //LUCENE-3821 fixes sloppy phrase scoring, except for this known problem
public void testMultiSloppyWithRepeats() throws IOException {
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
add("a b c d e f g h i k", writer);
IndexReader r = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(r);
MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
// this will fail, when the scorer would propagate [a] rather than [a,b],
qb.add(new Term[] {new Term("body", "a"), new Term("body", "b")});
qb.add(new Term[] {new Term("body", "a")});
qb.setSlop(6);
assertEquals(1, searcher.count(qb.build())); // should match on "a b"
r.close();
indexStore.close();
}
public void testMultiExactWithRepeats() throws IOException {
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
add("a b c d e f g h i k", writer);
IndexReader r = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(r);
MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
qb.add(new Term[] {new Term("body", "a"), new Term("body", "d")}, 0);
qb.add(new Term[] {new Term("body", "a"), new Term("body", "f")}, 2);
assertEquals(1, searcher.count(qb.build())); // should match on "a b"
r.close();
indexStore.close();
}
private void add(String s, RandomIndexWriter writer) throws IOException {
Document doc = new Document();
doc.add(newTextField("body", s, Field.Store.YES));
writer.addDocument(doc);
}
public void testBooleanQueryContainingSingleTermPrefixQuery()
throws IOException {
// this tests against bug 33161 (now fixed)
// In order to cause the bug, the outer query must have more than one term
// and all terms required.
// The contained PhraseMultiQuery must contain exactly one term array.
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
add("blueberry pie", writer);
add("blueberry chewing gum", writer);
add("blue raspberry pie", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
// This query will be equivalent to +body:pie +body:"blue*"
BooleanQuery.Builder q = new BooleanQuery.Builder();
q.add(new TermQuery(new Term("body", "pie")), BooleanClause.Occur.MUST);
MultiPhraseQuery.Builder troubleBuilder = new MultiPhraseQuery.Builder();
troubleBuilder.add(new Term[] {new Term("body", "blueberry"),
new Term("body", "blue")});
q.add(troubleBuilder.build(), BooleanClause.Occur.MUST);
// exception will be thrown here without fix
ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
assertEquals("Wrong number of hits", 2, hits.length);
// just make sure no exc:
searcher.explain(q.build(), 0);
writer.close();
reader.close();
indexStore.close();
}
public void testPhrasePrefixWithBooleanQuery() throws IOException {
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
add("This is a test", "object", writer);
add("a note", "note", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
// This query will be equivalent to +type:note +body:"a t*"
BooleanQuery.Builder q = new BooleanQuery.Builder();
q.add(new TermQuery(new Term("type", "note")), BooleanClause.Occur.MUST);
MultiPhraseQuery.Builder troubleBuilder = new MultiPhraseQuery.Builder();
troubleBuilder.add(new Term("body", "a"));
troubleBuilder
.add(new Term[] {new Term("body", "test"), new Term("body", "this")});
q.add(troubleBuilder.build(), BooleanClause.Occur.MUST);
// exception will be thrown here without fix for #35626:
ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
assertEquals("Wrong number of hits", 0, hits.length);
writer.close();
reader.close();
indexStore.close();
}
public void testNoDocs() throws Exception {
Directory indexStore = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
add("a note", "note", writer);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
qb.add(new Term("body", "a"));
qb.add(new Term[] {new Term("body", "nope"), new Term("body", "nope")});
MultiPhraseQuery q = qb.build();
assertEquals("Wrong number of hits", 0,
searcher.count(q));
// just make sure no exc:
searcher.explain(q, 0);
writer.close();
reader.close();
indexStore.close();
}
public void testHashCodeAndEquals() {
MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
MultiPhraseQuery query1 = query1builder.build();
MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
MultiPhraseQuery query2 = query2builder.build();
assertEquals(query1.hashCode(), query2.hashCode());
assertEquals(query1, query2);
Term term1 = new Term("someField", "someText");
query1builder.add(term1);
query1 = query1builder.build();
query2builder.add(term1);
query2 = query2builder.build();
assertEquals(query1.hashCode(), query2.hashCode());
assertEquals(query1, query2);
Term term2 = new Term("someField", "someMoreText");
query1builder.add(term2);
query1 = query1builder.build();
assertFalse(query1.hashCode() == query2.hashCode());
assertFalse(query1.equals(query2));
query2builder.add(term2);
query2 = query2builder.build();
assertEquals(query1.hashCode(), query2.hashCode());
assertEquals(query1, query2);
}
private void add(String s, String type, RandomIndexWriter writer)
throws IOException {
Document doc = new Document();
doc.add(newTextField("body", s, Field.Store.YES));
doc.add(newStringField("type", type, Field.Store.NO));
writer.addDocument(doc);
}
// LUCENE-2526
public void testEmptyToString() {
new MultiPhraseQuery.Builder().build().toString();
}
public void testZeroPosIncr() throws IOException {
Directory dir = new RAMDirectory();
final Token[] tokens = new Token[3];
tokens[0] = new Token();
tokens[0].append("a");
tokens[0].setPositionIncrement(1);
tokens[1] = new Token();
tokens[1].append("b");
tokens[1].setPositionIncrement(0);
tokens[2] = new Token();
tokens[2].append("c");
tokens[2].setPositionIncrement(0);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(new TextField("field", new CannedTokenStream(tokens)));
writer.addDocument(doc);
doc = new Document();
doc.add(new TextField("field", new CannedTokenStream(tokens)));
writer.addDocument(doc);
IndexReader r = writer.getReader();
writer.close();
IndexSearcher s = newSearcher(r);
MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
//mpq.setSlop(1);
// NOTE: not great that if we do the else clause here we
// get different scores! MultiPhraseQuery counts that
// phrase as occurring twice per doc (it should be 1, I
// think?). This is because MultipleTermPositions is able to
// return the same position more than once (0, in this
// case):
if (true) {
mpqb.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
mpqb.add(new Term[] {new Term("field", "a")}, 0);
} else {
mpqb.add(new Term[] {new Term("field", "a")}, 0);
mpqb.add(new Term[] {new Term("field", "b"), new Term("field", "c")}, 0);
}
TopDocs hits = s.search(mpqb.build(), 2);
assertEquals(2, hits.totalHits.value);
assertEquals(hits.scoreDocs[0].score, hits.scoreDocs[1].score, 1e-5);
/*
for(int hit=0;hit<hits.totalHits.value;hit++) {
ScoreDoc sd = hits.scoreDocs[hit];
System.out.println(" hit doc=" + sd.doc + " score=" + sd.score);
}
*/
r.close();
dir.close();
}
private static Token makeToken(String text, int posIncr) {
final Token t = new Token();
t.append(text);
t.setPositionIncrement(posIncr);
return t;
}
private final static Token[] INCR_0_DOC_TOKENS = new Token[] {
makeToken("x", 1),
makeToken("a", 1),
makeToken("1", 0),
makeToken("m", 1), // not existing, relying on slop=2
makeToken("b", 1),
makeToken("1", 0),
makeToken("n", 1), // not existing, relying on slop=2
makeToken("c", 1),
makeToken("y", 1)
};
private final static Token[] INCR_0_QUERY_TOKENS_AND = new Token[] {
makeToken("a", 1),
makeToken("1", 0),
makeToken("b", 1),
makeToken("1", 0),
makeToken("c", 1)
};
private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_MATCH = new Token[][] {
{ makeToken("a", 1) },
{ makeToken("x", 1), makeToken("1", 0) },
{ makeToken("b", 2) },
{ makeToken("x", 2), makeToken("1", 0) },
{ makeToken("c", 3) }
};
private final static Token[][] INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN = new Token[][] {
{ makeToken("x", 1) },
{ makeToken("a", 1), makeToken("1", 0) },
{ makeToken("x", 2) },
{ makeToken("b", 2), makeToken("1", 0) },
{ makeToken("c", 3) }
};
/**
* using query parser, MPQ will be created, and will not be strict about having all query terms
* in each position - one of each position is sufficient (OR logic)
*/
public void testZeroPosIncrSloppyParsedAnd() throws IOException {
MultiPhraseQuery.Builder qb = new MultiPhraseQuery.Builder();
qb.add(new Term[]{ new Term("field", "a"), new Term("field", "1") }, -1);
qb.add(new Term[]{ new Term("field", "b"), new Term("field", "1") }, 0);
qb.add(new Term[]{ new Term("field", "c") }, 1);
doTestZeroPosIncrSloppy(qb.build(), 0);
qb.setSlop(1);
doTestZeroPosIncrSloppy(qb.build(), 0);
qb.setSlop(2);
doTestZeroPosIncrSloppy(qb.build(), 1);
}
private void doTestZeroPosIncrSloppy(Query q, int nExpected) throws IOException {
Directory dir = newDirectory(); // random dir
IndexWriterConfig cfg = newIndexWriterConfig(null);
IndexWriter writer = new IndexWriter(dir, cfg);
Document doc = new Document();
doc.add(new TextField("field", new CannedTokenStream(INCR_0_DOC_TOKENS)));
writer.addDocument(doc);
IndexReader r = DirectoryReader.open(writer);
writer.close();
IndexSearcher s = newSearcher(r);
if (VERBOSE) {
System.out.println("QUERY=" + q);
}
TopDocs hits = s.search(q, 1);
assertEquals("wrong number of results", nExpected, hits.totalHits.value);
if (VERBOSE) {
for(int hit=0;hit<hits.totalHits.value;hit++) {
ScoreDoc sd = hits.scoreDocs[hit];
System.out.println(" hit doc=" + sd.doc + " score=" + sd.score);
}
}
r.close();
dir.close();
}
/**
* PQ AND Mode - Manually creating a phrase query
*/
public void testZeroPosIncrSloppyPqAnd() throws IOException {
PhraseQuery.Builder builder = new PhraseQuery.Builder();
int pos = -1;
for (Token tap : INCR_0_QUERY_TOKENS_AND) {
pos += tap.getPositionIncrement();
builder.add(new Term("field", tap.toString()), pos);
}
builder.setSlop(0);
doTestZeroPosIncrSloppy(builder.build(), 0);
builder.setSlop(1);
doTestZeroPosIncrSloppy(builder.build(), 0);
builder.setSlop(2);
doTestZeroPosIncrSloppy(builder.build(), 1);
}
/**
* MPQ AND Mode - Manually creating a multiple phrase query
*/
public void testZeroPosIncrSloppyMpqAnd() throws IOException {
final MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
int pos = -1;
for (Token tap : INCR_0_QUERY_TOKENS_AND) {
pos += tap.getPositionIncrement();
mpqb.add(new Term[]{new Term("field",tap.toString())}, pos); //AND logic
}
doTestZeroPosIncrSloppy(mpqb.build(), 0);
mpqb.setSlop(1);
doTestZeroPosIncrSloppy(mpqb.build(), 0);
mpqb.setSlop(2);
doTestZeroPosIncrSloppy(mpqb.build(), 1);
}
/**
* MPQ Combined AND OR Mode - Manually creating a multiple phrase query
*/
public void testZeroPosIncrSloppyMpqAndOrMatch() throws IOException {
final MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_MATCH) {
Term[] terms = tapTerms(tap);
final int pos = tap[0].getPositionIncrement()-1;
mpqb.add(terms, pos); //AND logic in pos, OR across lines
}
doTestZeroPosIncrSloppy(mpqb.build(), 0);
mpqb.setSlop(1);
doTestZeroPosIncrSloppy(mpqb.build(), 0);
mpqb.setSlop(2);
doTestZeroPosIncrSloppy(mpqb.build(), 1);
}
/**
* MPQ Combined AND OR Mode - Manually creating a multiple phrase query - with no match
*/
public void testZeroPosIncrSloppyMpqAndOrNoMatch() throws IOException {
final MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
for (Token tap[] : INCR_0_QUERY_TOKENS_AND_OR_NO_MATCHN) {
Term[] terms = tapTerms(tap);
final int pos = tap[0].getPositionIncrement()-1;
mpqb.add(terms, pos); //AND logic in pos, OR across lines
}
doTestZeroPosIncrSloppy(mpqb.build(), 0);
mpqb.setSlop(2);
doTestZeroPosIncrSloppy(mpqb.build(), 0);
}
private Term[] tapTerms(Token[] tap) {
Term[] terms = new Term[tap.length];
for (int i=0; i<terms.length; i++) {
terms[i] = new Term("field",tap[i].toString());
}
return terms;
}
public void testNegativeSlop() throws Exception {
MultiPhraseQuery.Builder queryBuilder = new MultiPhraseQuery.Builder();
queryBuilder.add(new Term("field", "two"));
queryBuilder.add(new Term("field", "one"));
expectThrows(IllegalArgumentException.class, () -> {
queryBuilder.setSlop(-2);
});
}
}