// blob: 60d9349fbf320ed1c10c958583911b1254c1602a
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.highlight;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
public class HighlighterPhraseTest extends LuceneTestCase {
private static final String FIELD = "text";
/**
 * Checks that highlighting a {@link PhraseQuery} for "fox jumped" yields the same fragment from
 * a fresh {@code TokenStreamConcurrent} and from the token stream rebuilt out of the indexed
 * term vector. The indexed stream places "jump" and "jumped" at the same position.
 *
 * @throws IOException if indexing or reading the index fails
 * @throws InvalidTokenOffsetsException propagated from {@code Highlighter.getBestFragment}
 */
public void testConcurrentPhrase() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox jumped";
  // The directory is the outermost resource: it is closed last (after writer and reader) and is
  // released even when indexing, opening the reader, or an assertion fails.
  try (Directory directory = newDirectory()) {
    try (IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)))) {
      // Term vectors with positions and offsets are needed to rebuild the token stream later.
      final FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectors(true);
      final Document document = new Document();
      document.add(new Field(FIELD, new TokenStreamConcurrent(), customType));
      indexWriter.addDocument(document);
    }
    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final PhraseQuery phraseQuery = new PhraseQuery(FIELD, "fox", "jumped");
      final TopDocs hits = indexSearcher.search(phraseQuery, 1);
      assertEquals(1, hits.totalHits.value);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final TokenStream tokenStream =
          TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
      final String expected = highlighter.getBestFragment(new TokenStreamConcurrent(), TEXT);
      assertEquals(expected, highlighter.getBestFragment(tokenStream, TEXT));
    }
  }
}
/**
 * Checks that highlighting an in-order, zero-slop {@link SpanNearQuery} for "fox" followed by
 * "jumped" yields the same fragment from a fresh {@code TokenStreamConcurrent} and from the
 * token stream rebuilt out of the indexed term vector.
 *
 * <p>Matching documents are collected into a bit set and every set bit is then highlighted.
 *
 * @throws IOException if indexing, searching or reading the index fails
 * @throws InvalidTokenOffsetsException propagated from {@code Highlighter.getBestFragment}
 */
public void testConcurrentSpan() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox jumped";
  // The directory is the outermost resource: it is closed last (after writer and reader) and is
  // released even when indexing, opening the reader, or an assertion fails.
  try (Directory directory = newDirectory()) {
    try (IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)))) {
      final FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectors(true);
      final Document document = new Document();
      document.add(new Field(FIELD, new TokenStreamConcurrent(), customType));
      indexWriter.addDocument(document);
    }
    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
          new SpanTermQuery(new Term(FIELD, "fox")),
          new SpanTermQuery(new Term(FIELD, "jumped")) }, 0, true);
      final FixedBitSet bitset = new FixedBitSet(indexReader.maxDoc());
      indexSearcher.search(phraseQuery, new SimpleCollector() {
        // Doc id offset of the segment currently being collected.
        private int baseDoc;

        @Override
        public void collect(int i) {
          // i is segment-relative; record the index-wide doc id.
          bitset.set(this.baseDoc + i);
        }

        @Override
        protected void doSetNextReader(LeafReaderContext context) throws IOException {
          this.baseDoc = context.docBase;
        }

        @Override
        public void setScorer(Scorable scorer) {
          // Do Nothing
        }

        @Override
        public ScoreMode scoreMode() {
          return ScoreMode.COMPLETE_NO_SCORES;
        }
      });
      assertEquals(1, bitset.cardinality());
      final int maxDoc = indexReader.maxDoc();
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      // Visit every doc id in [0, maxDoc). The previous bound "position < maxDoc-1" skipped the
      // last document, so with a single-document index the comparison below never executed.
      for (int docId = 0; docId < maxDoc; docId++) {
        if (!bitset.get(docId)) {
          continue;
        }
        assertEquals(0, docId);
        final TokenStream tokenStream =
            TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(docId), -1);
        final String expected = highlighter.getBestFragment(new TokenStreamConcurrent(), TEXT);
        assertEquals(expected, highlighter.getBestFragment(tokenStream, TEXT));
      }
    }
  }
}
/**
 * Indexes {@code TokenStreamSparse}, in which "jump" is two positions after "did", and checks
 * that an exact {@link PhraseQuery} for "did jump" finds no document. It then checks that the
 * highlighter gives the same result for a fresh {@code TokenStreamSparse} and for the token
 * stream rebuilt out of the indexed term vector.
 *
 * @throws IOException if indexing or reading the index fails
 * @throws InvalidTokenOffsetsException propagated from {@code Highlighter.getBestFragment}
 */
public void testSparsePhrase() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  // The directory is the outermost resource: it is closed last (after writer and reader) and is
  // released even when indexing, opening the reader, or an assertion fails.
  try (Directory directory = newDirectory()) {
    try (IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)))) {
      final FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectors(true);
      final Document document = new Document();
      document.add(new Field(FIELD, new TokenStreamSparse(), customType));
      indexWriter.addDocument(document);
    }
    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final PhraseQuery phraseQuery = new PhraseQuery(FIELD, "did", "jump");
      final TopDocs hits = indexSearcher.search(phraseQuery, 1);
      // The position gap between "did" and "jump" must prevent an exact phrase match.
      assertEquals(0, hits.totalHits.value);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final TokenStream tokenStream =
          TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
      final String expected = highlighter.getBestFragment(new TokenStreamSparse(), TEXT);
      assertEquals(expected, highlighter.getBestFragment(tokenStream, TEXT));
    }
  }
}
/**
 * Indexes the plain text with a field type that enables term vectors and term-vector offsets
 * but does not enable term-vector positions. It then checks that a {@link PhraseQuery} with
 * slop 1 for "did jump" matches, and that highlighting the token stream rebuilt from the term
 * vector marks both terms.
 *
 * @throws IOException if indexing or reading the index fails
 * @throws InvalidTokenOffsetsException propagated from {@code Highlighter.getBestFragment}
 */
public void testSparsePhraseWithNoPositions() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  // The directory is the outermost resource: it is closed last (after writer and reader) and is
  // released even when indexing, opening the reader, or an assertion fails.
  try (Directory directory = newDirectory()) {
    try (IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)))) {
      // Deliberately no setStoreTermVectorPositions(true) here.
      final FieldType customType = new FieldType(TextField.TYPE_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectors(true);
      final Document document = new Document();
      document.add(new Field(FIELD, TEXT, customType));
      indexWriter.addDocument(document);
    }
    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final PhraseQuery phraseQuery = new PhraseQuery(1, FIELD, "did", "jump");
      final TopDocs hits = indexSearcher.search(phraseQuery, 1);
      assertEquals(1, hits.totalHits.value);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final TokenStream tokenStream =
          TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
      final String actual = highlighter.getBestFragment(tokenStream, TEXT);
      assertEquals("the fox <B>did</B> not <B>jump</B>", actual);
    }
  }
}
/**
 * Indexes {@code TokenStreamSparse}, in which "jump" is two positions after "did", and checks
 * that an in-order, zero-slop {@link SpanNearQuery} for "did" followed by "jump" finds no
 * document. It then checks that the highlighter gives the same result for a fresh
 * {@code TokenStreamSparse} and for the token stream rebuilt out of the indexed term vector.
 *
 * @throws IOException if indexing or reading the index fails
 * @throws InvalidTokenOffsetsException propagated from {@code Highlighter.getBestFragment}
 */
public void testSparseSpan() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  // The directory is the outermost resource: it is closed last (after writer and reader) and is
  // released even when indexing, opening the reader, or an assertion fails.
  try (Directory directory = newDirectory()) {
    try (IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)))) {
      final FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setStoreTermVectorOffsets(true);
      customType.setStoreTermVectorPositions(true);
      customType.setStoreTermVectors(true);
      final Document document = new Document();
      document.add(new Field(FIELD, new TokenStreamSparse(), customType));
      indexWriter.addDocument(document);
    }
    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
          new SpanTermQuery(new Term(FIELD, "did")),
          new SpanTermQuery(new Term(FIELD, "jump")) }, 0, true);
      final TopDocs hits = indexSearcher.search(phraseQuery, 1);
      // The position gap between "did" and "jump" must prevent a zero-slop span match.
      assertEquals(0, hits.totalHits.value);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final TokenStream tokenStream =
          TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
      final String expected = highlighter.getBestFragment(new TokenStreamSparse(), TEXT);
      assertEquals(expected, highlighter.getBestFragment(tokenStream, TEXT));
    }
  }
}
//shows the need to sum the increments in WeightedSpanTermExtractor
/**
 * Indexes text through an analyzer configured with {@code MockTokenFilter.ENGLISH_STOPSET} and
 * searches it with a {@link PhraseQuery} whose terms are added at explicit positions 0, 3 and 7.
 * The query must match the document and the highlighter must return exactly one fragment.
 *
 * @throws IOException if indexing or reading the index fails
 * @throws InvalidTokenOffsetsException propagated from {@code Highlighter.getBestFragments}
 */
public void testStopWords() throws IOException, InvalidTokenOffsetsException {
  final MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  final String TEXT = "the ab the the cd the the the ef the";
  // The directory is the outermost resource so it is also released when indexing fails, not
  // only when the reader block completes.
  try (Directory directory = newDirectory()) {
    try (IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(stopAnalyzer))) {
      final Document document = new Document();
      document.add(newTextField(FIELD, TEXT, Store.YES));
      indexWriter.addDocument(document);
    }
    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      //equivalent of "ab the the cd the the the ef"
      final PhraseQuery.Builder builder = new PhraseQuery.Builder();
      builder.add(new Term(FIELD, "ab"), 0);
      builder.add(new Term(FIELD, "cd"), 3);
      builder.add(new Term(FIELD, "ef"), 7);
      final PhraseQuery phraseQuery = builder.build();
      final TopDocs hits = indexSearcher.search(phraseQuery, 100);
      assertEquals(1, hits.totalHits.value);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final String[] fragments = highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10);
      assertEquals(1, fragments.length);
    }
  }
}
//shows the need to require inOrder if getSlop() == 0, not if final slop == 0
//in WeightedSpanTermExtractor
/**
 * Indexes text that contains both "cd the ab" and "ab the cd" through an analyzer configured
 * with {@code MockTokenFilter.ENGLISH_STOPSET}, and searches it with a {@link PhraseQuery}
 * placing "ab" at position 0 and "cd" at position 2. The single returned fragment must
 * highlight the in-order occurrence and must not highlight the reversed one.
 *
 * @throws IOException if indexing or reading the index fails
 * @throws InvalidTokenOffsetsException propagated from {@code Highlighter.getBestFragments}
 */
public void testInOrderWithStopWords() throws IOException, InvalidTokenOffsetsException {
  final MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET);
  final String TEXT = "the cd the ab the the the the the the the ab the cd the";
  // The directory is the outermost resource so it is also released when indexing fails, not
  // only when the reader block completes.
  try (Directory directory = newDirectory()) {
    try (IndexWriter indexWriter = new IndexWriter(directory,
        newIndexWriterConfig(stopAnalyzer))) {
      final Document document = new Document();
      document.add(newTextField(FIELD, TEXT, Store.YES));
      indexWriter.addDocument(document);
    }
    try (IndexReader indexReader = DirectoryReader.open(directory)) {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      //equivalent of "ab the cd"
      final PhraseQuery.Builder builder = new PhraseQuery.Builder();
      builder.add(new Term(FIELD, "ab"), 0);
      builder.add(new Term(FIELD, "cd"), 2);
      final PhraseQuery phraseQuery = builder.build();
      final TopDocs hits = indexSearcher.search(phraseQuery, 100);
      assertEquals(1, hits.totalHits.value);
      final Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
          new QueryScorer(phraseQuery));
      final String[] frags = highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10);
      assertEquals(1, frags.length);
      assertTrue("contains <B>ab</B> the <B>cd</B>",
          frags[0].contains("<B>ab</B> the <B>cd</B>"));
      assertFalse("does not contain <B>cd</B> the <B>ab</B>",
          frags[0].contains("<B>cd</B> the <B>ab</B>"));
    }
  }
}
/**
 * Fixed token stream emitting "the", "fox", "did" and "jump" with hard-coded offsets. The last
 * token carries a position increment of 2, which leaves a one-position gap after "did".
 */
private static final class TokenStreamSparse extends TokenStream {
  /** Tokens to replay; rebuilt on every {@link #reset()}. */
  private Token[] tokens;
  /** Index of the token emitted most recently, or -1 before the first one. */
  private int cursor = -1;
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);

  /** Creates the stream already positioned before its first token. */
  public TokenStreamSparse() {
    reset();
  }

  /**
   * Advances to the next token and copies its term, offsets and position increment into the
   * stream attributes.
   *
   * @return {@code true} if a token was produced, {@code false} once all tokens are consumed
   */
  @Override
  public boolean incrementToken() {
    if (++cursor >= tokens.length) {
      return false;
    }
    final Token current = tokens[cursor];
    clearAttributes();
    termAttribute.setEmpty().append(current);
    offsetAttribute.setOffset(current.startOffset(), current.endOffset());
    positionIncrementAttribute.setPositionIncrement(current.getPositionIncrement());
    return true;
  }

  /** Rewinds the stream and recreates the token array. */
  @Override
  public void reset() {
    cursor = -1;
    final Token jump = new Token("jump", 16, 20);
    // Increment of 2 skips one position between "did" and "jump".
    jump.setPositionIncrement(2);
    tokens = new Token[] {
        new Token("the", 0, 3),
        new Token("fox", 4, 7),
        new Token("did", 8, 11),
        jump };
  }
}
/**
 * Fixed token stream emitting "the", "fox", "jump" and "jumped" with hard-coded offsets. The
 * last token carries a position increment of 0 and the same offsets as "jump", so both tokens
 * occupy the same position.
 */
private static final class TokenStreamConcurrent extends TokenStream {
  /** Tokens to replay; rebuilt on every {@link #reset()}. */
  private Token[] tokens;
  /** Index of the token emitted most recently, or -1 before the first one. */
  private int cursor = -1;
  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);

  /** Creates the stream already positioned before its first token. */
  public TokenStreamConcurrent() {
    reset();
  }

  /**
   * Advances to the next token and copies its term, offsets and position increment into the
   * stream attributes.
   *
   * @return {@code true} if a token was produced, {@code false} once all tokens are consumed
   */
  @Override
  public boolean incrementToken() {
    if (++cursor >= tokens.length) {
      return false;
    }
    final Token current = tokens[cursor];
    clearAttributes();
    termAttribute.setEmpty().append(current);
    offsetAttribute.setOffset(current.startOffset(), current.endOffset());
    positionIncrementAttribute.setPositionIncrement(current.getPositionIncrement());
    return true;
  }

  /** Rewinds the stream and recreates the token array. */
  @Override
  public void reset() {
    cursor = -1;
    final Token jumped = new Token("jumped", 8, 14);
    // Increment of 0 stacks "jumped" on the same position as the preceding "jump".
    jumped.setPositionIncrement(0);
    tokens = new Token[] {
        new Token("the", 0, 3),
        new Token("fox", 4, 7),
        new Token("jump", 8, 14),
        jumped };
  }
}
}