blob: 75758a32a0218337397e2ccbd93c99141568120e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestUnifiedHighlighterRanking extends LuceneTestCase {
// Analyzer used to index documents and also handed to every UnifiedHighlighter built in this
// class; each test method assigns a new MockAnalyzer to it before indexing.
Analyzer indexAnalyzer;
// Field type used for the "body" field in all tests, obtained from UHTestHelper.randomFieldType.
// note: all offset sources, by default, use term freq, so it shouldn't matter which we choose.
final FieldType fieldType = UHTestHelper.randomFieldType(random());
/**
 * Indexes documents made of random single-letter "words" and, for every document, highlights
 * the top n and the top n+1 passages for a series of queries.
 * The passages reported for n must always be contained in those reported for n+1.
 */
// TODO: this only tests single-valued fields. we should also index multiple values per field!
public void testRanking() throws Exception {
  // how many documents get indexed; each of them is verified afterwards
  final int docCount = atLeast(20);
  // upper bound for the number of passages requested from the highlighter
  final int topNLimit = 3;
  // upper bound for the number of words in one sentence
  final int sentenceLengthLimit = 10;
  // upper bound for the number of sentences in one document
  final int sentenceCountLimit = 20;

  Directory directory = newDirectory();
  indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, indexAnalyzer);

  // a single Document with two fields is reused; only the field values change per iteration
  Field idField = new StringField("id", "", Field.Store.NO);
  Field bodyField = new Field("body", "", fieldType);
  Document reusedDocument = new Document();
  reusedDocument.add(idField);
  reusedDocument.add(bodyField);

  for (int docId = 0; docId < docCount; docId++) {
    int sentenceCount = TestUtil.nextInt(random(), 1, sentenceCountLimit);
    StringBuilder text = new StringBuilder();
    for (int remaining = sentenceCount; remaining > 0; remaining--) {
      text.append(newSentence(random(), sentenceLengthLimit));
    }
    bodyField.setStringValue(text.toString());
    idField.setStringValue(Integer.toString(docId));
    writer.addDocument(reusedDocument);
  }

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  for (int docId = 0; docId < docCount; docId++) {
    checkDocument(searcher, docId, topNLimit);
  }

  writer.close();
  reader.close();
  directory.close();
}
/**
 * Runs the ranking check against one document for every letter 'a'..'z': once as a plain term
 * query and once as a disjunction of the letter and the character following it.
 *
 * @param is searcher over the test index
 * @param doc value of the "id" field identifying the document to check
 * @param maxTopN passed through to {@code checkQuery}
 */
private void checkDocument(IndexSearcher is, int doc, int maxTopN) throws IOException {
  for (char letter = 'a'; letter <= 'z'; letter++) {
    final Term current = new Term("body", String.valueOf(letter));

    // a simple term query
    checkQuery(is, new TermQuery(current), doc, maxTopN);

    // a boolean query: the letter OR its successor
    // NOTE(review): for 'z' the successor is '{', which the generated sentences never contain,
    // so that last disjunction only ever matches on 'z'.
    final Term successor = new Term("body", String.valueOf((char) (letter + 1)));
    BooleanQuery.Builder either = new BooleanQuery.Builder();
    either.add(new TermQuery(current), BooleanClause.Occur.SHOULD);
    either.add(new TermQuery(successor), BooleanClause.Occur.SHOULD);
    checkQuery(is, either.build(), doc, maxTopN);
  }
}
/**
 * For n = 1 .. maxTopN - 1, highlights the given document with n and with n + 1 passages and
 * asserts that every passage seen for n was also seen for n + 1.
 *
 * @param is searcher over the test index
 * @param query query whose matches are highlighted in the "body" field
 * @param doc value of the "id" field used to restrict the search to one document
 * @param maxTopN exclusive upper bound for n
 */
private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
  for (int topN = 1; topN < maxTopN; topN++) {
    final FakePassageFormatter smallerRun = new FakePassageFormatter();
    final UnifiedHighlighter smallerHighlighter = newCollectingHighlighter(is, smallerRun);
    final FakePassageFormatter largerRun = new FakePassageFormatter();
    final UnifiedHighlighter largerHighlighter = newCollectingHighlighter(is, largerRun);

    // restrict the query to the single document under test
    BooleanQuery restricted = new BooleanQuery.Builder()
        .add(query, BooleanClause.Occur.MUST)
        .add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST)
        .build();
    TopDocs hits = is.search(restricted, 1);

    smallerHighlighter.highlight("body", restricted, hits, topN);
    largerHighlighter.highlight("body", restricted, hits, topN + 1);
    assertTrue(largerRun.seen.containsAll(smallerRun.seen));
  }
}

/**
 * Builds a highlighter whose formatter for the "body" field is the given collector and whose
 * max length is set to Integer.MAX_VALUE - 1.
 */
private UnifiedHighlighter newCollectingHighlighter(IndexSearcher is, final PassageFormatter collector) {
  UnifiedHighlighter highlighter = new UnifiedHighlighter(is, indexAnalyzer) {
    @Override
    protected PassageFormatter getFormatter(String field) {
      assertEquals("body", field);
      return collector;
    }
  };
  highlighter.setMaxLength(Integer.MAX_VALUE - 1);
  return highlighter;
}
/**
 * Builds one random sentence of 1 to maxSentenceLength single-character "words".
 * The first word is an upper-case letter, all following words are lower-case letters
 * separated by single spaces, and the sentence is terminated with ". ".
 */
private String newSentence(Random r, int maxSentenceLength) {
  final int wordCount = TestUtil.nextInt(r, 1, maxSentenceLength);
  final StringBuilder sentence = new StringBuilder();
  for (int word = 0; word < wordCount; word++) {
    if (word == 0) {
      // capitalize the first word to help breakiterator
      sentence.append((char) TestUtil.nextInt(r, 'A', 'Z'));
      continue;
    }
    sentence.append(' ').append((char) TestUtil.nextInt(r, 'a', 'z'));
  }
  // finalize sentence
  return sentence.append(". ").toString();
}
/**
 * A formatter that produces no real output: it sanity-checks every passage it is given and
 * records the passage's start/end offsets in {@link #seen} so tests can compare runs.
 */
static class FakePassageFormatter extends PassageFormatter {
  HashSet<Pair> seen = new HashSet<>();

  @Override
  public String format(Passage passages[], String content) {
    for (Passage passage : passages) {
      checkPassage(passage, content);
      checkMatches(passage, content);
      // record just the start/end offset for simplicity
      seen.add(new Pair(passage.getStartOffset(), passage.getEndOffset()));
    }
    return "bogus!!!!!!";
  }

  /** Verifies score, match count and that the passage offsets lie within the content. */
  private static void checkPassage(Passage passage, String content) {
    assertTrue(passage.getScore() >= 0);
    assertTrue(passage.getNumMatches() > 0);
    assertTrue(passage.getStartOffset() >= 0);
    assertTrue(passage.getStartOffset() <= content.length());
    assertTrue(passage.getEndOffset() >= passage.getStartOffset());
    assertTrue(passage.getEndOffset() <= content.length());
  }

  /**
   * Verifies every match of the passage: offsets are non-negative, start inside the passage,
   * never move backwards, span exactly one character, and the one-byte term equals the
   * lower-cased content character at the match start.
   */
  private static void checkMatches(Passage passage, String content) {
    // we use a very simple analyzer. so we can assert the matches are correct
    int previousStart = -1;
    for (int m = 0; m < passage.getNumMatches(); m++) {
      final BytesRef term = passage.getMatchTerms()[m];
      final int from = passage.getMatchStarts()[m];
      assertTrue(from >= 0);
      // must at least start within the passage
      assertTrue(from < passage.getEndOffset());
      final int to = passage.getMatchEnds()[m];
      assertTrue(to >= 0);
      // always moving forward
      assertTrue(from >= previousStart);
      previousStart = from;
      // single character terms
      assertEquals(from + 1, to);
      // and the offsets must be correct...
      assertEquals(1, term.length);
      assertEquals((char) term.bytes[term.offset], Character.toLowerCase(content.charAt(from)));
    }
  }
}
/**
 * Immutable (start, end) offset pair with value-based equality, usable as a hash set element.
 */
static class Pair {
  final int start;
  final int end;

  Pair(int start, int end) {
    this.start = start;
    this.end = end;
  }

  /** Combines end and then start with the multiplier 31, starting from 1. */
  @Override
  public int hashCode() {
    return 31 * (31 + end) + start;
  }

  /** Two pairs are equal when they have the same runtime class, start and end. */
  @Override
  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }
    if (obj == null || obj.getClass() != getClass()) {
      return false;
    }
    final Pair that = (Pair) obj;
    return that.end == end && that.start == start;
  }

  @Override
  public String toString() {
    return new StringBuilder("Pair [start=")
        .append(start)
        .append(", end=")
        .append(end)
        .append("]")
        .toString();
  }
}
/**
 * Uses a PassageScorer with b=0 (no passage length normalization) and expects the long
 * second sentence, which contains the query term twice, to be returned as the best passage.
 */
public void testCustomB() throws Exception {
  Directory directory = newDirectory();
  indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  IndexWriterConfig config = newIndexWriterConfig(indexAnalyzer);
  config.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  final String text =
      "This is a test. This test is a better test but the sentence is excruiatingly long, "
          + "you have no idea how painful it was for me to type this long sentence into my IDE.";
  Document document = new Document();
  document.add(new Field("body", text, fieldType));
  writer.addDocument(document);

  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);
  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
    @Override
    protected Set<HighlightFlag> getFlags(String field) {
      // randomly alternate between the default flags and an explicit flag set
      if (!random().nextBoolean()) {
        return super.getFlags(field);
      }
      return EnumSet.of(HighlightFlag.MULTI_TERM_QUERY, HighlightFlag.PHRASES, HighlightFlag.WEIGHT_MATCHES);
    }

    @Override
    protected PassageScorer getScorer(String field) {
      // second argument is b; 0 disables length normalization
      return new PassageScorer(1.2f, 0, 87);
    }
  };

  Query termQuery = new TermQuery(new Term("body", "test"));
  TopDocs hits = searcher.search(termQuery, 10, Sort.INDEXORDER);
  assertEquals(1, hits.totalHits.value);

  String[] passages = highlighter.highlight("body", termQuery, hits, 1);
  assertEquals(1, passages.length);
  assertTrue(passages[0].startsWith("This <b>test</b> is a better <b>test</b>"));

  reader.close();
  directory.close();
}
/**
 * Uses a PassageScorer with k1=0, i.e. simple coordinate-level matching (number of distinct
 * query terms present), and expects the sentence containing both "foo" and "bar" to win over
 * the sentences that repeat only one of them.
 */
public void testCustomK1() throws Exception {
  Directory directory = newDirectory();
  indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  IndexWriterConfig config = newIndexWriterConfig(indexAnalyzer);
  config.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  final String text =
      "This has only foo foo. "
          + "On the other hand this sentence contains both foo and bar. "
          + "This has only bar bar bar bar bar bar bar bar bar bar bar bar.";
  Document document = new Document();
  document.add(new Field("body", text, fieldType));
  writer.addDocument(document);

  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);
  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
    @Override
    protected Set<HighlightFlag> getFlags(String field) {
      // randomly alternate between the default flags and an explicit flag set
      if (!random().nextBoolean()) {
        return super.getFlags(field);
      }
      return EnumSet.of(HighlightFlag.MULTI_TERM_QUERY, HighlightFlag.PHRASES, HighlightFlag.WEIGHT_MATCHES);
    }

    @Override
    protected PassageScorer getScorer(String field) {
      // first argument is k1; 0 makes term frequency irrelevant
      return new PassageScorer(0, 0.75f, 87);
    }
  };

  BooleanQuery.Builder fooOrBar = new BooleanQuery.Builder();
  fooOrBar.add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD);
  fooOrBar.add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD);
  BooleanQuery disjunction = fooOrBar.build();

  TopDocs hits = searcher.search(disjunction, 10, Sort.INDEXORDER);
  assertEquals(1, hits.totalHits.value);

  String[] passages = highlighter.highlight("body", disjunction, hits, 1);
  assertEquals(1, passages.length);
  assertTrue(passages[0].startsWith("On the other hand"));

  reader.close();
  directory.close();
}
}