/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
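/**
 * End-to-end examples of {@link MatchHighlighter} usage: basic field highlighting, synonym
 * handling, custom per-field highlighters and highlighting of more than one query at once.
 */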
public class TestMatchHighlighter extends LuceneTestCase {
private static final String FLD_ID = "id";
private static final String FLD_TEXT1 = "text1";
private static final String FLD_TEXT2 = "text2";
private FieldType TYPE_TEXT_POSITIONS_OFFSETS;
private FieldType TYPE_TEXT_POSITIONS;
private PerFieldAnalyzerWrapper analyzer;
@Before
public void setup() throws IOException {
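// Two ways of indexing text used in these tests: positions only, and positions plus
// character offsets stored in the index.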
TYPE_TEXT_POSITIONS = TextField.TYPE_STORED;
TYPE_TEXT_POSITIONS_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_TEXT_POSITIONS_OFFSETS.setIndexOptions(
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_TEXT_POSITIONS_OFFSETS.freeze();
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
// Create an analyzer with some synonyms, just to showcase them.
SynonymMap synonymMap =
buildSynonymMap(
new String[][] {
{"moon\u0000shine", "firewater"},
{"firewater", "moon\u0000shine"},
});
// Make a non-empty offset gap so that the break iterator doesn't go haywire on multiple
// values glued together.
final int offsetGap = RandomizedTest.randomIntBetween(1, 2);
final int positionGap = RandomizedTest.randomFrom(new int[] {0, 1, 100});
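// AnalyzerWithGaps (a test helper in this package) wraps the delegate analyzer so that the
// randomized gaps above are inserted between multiple values of the same field.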
Analyzer synonymsAnalyzer =
new AnalyzerWithGaps(
offsetGap,
positionGap,
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
});
fieldAnalyzers.put(FLD_TEXT1, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT2, synonymsAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
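/**
 * Builds a SynonymMap from (input, output) pairs. The embedded NUL character
 * (SynonymMap.WORD_SEPARATOR) separates tokens of a multi-term synonym.
 */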
static SynonymMap buildSynonymMap(String[][] synonyms) throws IOException {
SynonymMap.Builder builder = new SynonymMap.Builder();
for (String[] pair : synonyms) {
assertThat(pair.length, Matchers.equalTo(2));
builder.add(new CharsRef(pair[0]), new CharsRef(pair[1]), true);
}
return builder.build();
}
@Test
public void testBasicUsage() throws IOException {
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "foo bar baz")
.doc(FLD_TEXT1, "bar foo baz")
.doc(
fields -> {
fields.add(FLD_TEXT1, "Very long content but not matching anything.");
fields.add(FLD_TEXT2, "no foo but bar");
})
.build(
analyzer,
reader -> {
Query query =
new BooleanQuery.Builder()
.add(new TermQuery(new Term(FLD_TEXT1, "foo")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(FLD_TEXT2, "bar")), BooleanClause.Occur.SHOULD)
.build();
// In the most basic scenario, we run a search against a query, retrieve
// top docs...
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// ...and would want a fixed set of fields from those documents, some of them
// possibly highlighted if they matched the query.
//
// This configures the highlighter so that the FLD_ID field is always returned
// verbatim, and FLD_TEXT1 is returned *only if it contained a query match*.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
// Note document field highlights are a stream over documents in topDocs. In the
// remaining code we will just collect them on the fly into a preformatted string.
Stream<MatchHighlighter.DocHighlights> highlights =
highlighter.highlight(topDocs, query);
assertHighlights(
toDocList(highlights),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2");
// In a more realistic use case, you'd want to show the value of a given field
// *regardless* of whether it contained a highlight or not -- it is odd that document
// "id: 2" above doesn't have the 'text1' field shown because that field wasn't part of
// the query match.
//
// Let's say the field is also potentially long; if it contains a match, we would want
// to display the contextual snippet surrounding that match. If it does not contain any
// matches, we would want to display its content up to a given number of characters
// (lead lines).
//
// Let's do this by adding an appropriate field highlighter on FLD_TEXT1.
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(
FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of(FLD_TEXT1)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2",
" text1: Very long...");
// Field highlighters can apply to multiple fields and be chained for convenience.
// For example, this defines a combined highlighter over both FLD_TEXT1 and FLD_TEXT2.
Set<String> fields = Set.of(FLD_TEXT1, FLD_TEXT2);
MatchHighlighter.FieldValueHighlighter highlightedOrAbbreviated =
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), fields::contains)
.or(FieldValueHighlighters.maxLeadingCharacters(10, "...", fields));
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(highlightedOrAbbreviated)
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2",
" text1: Very long...",
" text2: no foo but >bar<");
});
}
@Test
public void testSynonymHighlight() throws IOException {
// There is nothing special needed to highlight or process complex queries, synonyms, etc.
// The synonyms are defined in the setup() method of this class.
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
.build(
analyzer,
reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
Query query = new TermQuery(new Term(FLD_TEXT1, "firewater"));
assertHighlights(
toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
"0. text1: Where the >moon shine< falls, >firewater< flows.");
query = new PhraseQuery(FLD_TEXT1, "moon", "shine");
assertHighlights(
toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
"0. text1: Where the >moon shine< falls, >firewater< flows.");
});
}
@Test
public void testCustomFieldHighlightHandling() throws IOException {
// MatchHighlighter is a showcase of the individual components in this package, suitable
// for creating any kind of field-display design.
//
// In this example we will build a custom field highlighting handler that highlights
// matches over a multivalued field, shows that field's values if it received no
// matches, and limits the number of values displayed to at most 2 (with an appropriate
// message).
new IndexBuilder(this::toField)
// Just one document, one field, four values.
.doc(FLD_TEXT1, "foo bar", "bar foo baz", "bar baz foo", "baz baz baz")
.build(
analyzer,
reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER;
// Let's start with the simple predefined highlighter so that the field's value shows
// and is highlighted when it was part of the hit.
MatchHighlighter.FieldValueHighlighter highlighted =
FieldValueHighlighters.highlighted(
80 * 3, 2, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals);
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(highlighted)
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
Query query = new TermQuery(new Term(FLD_TEXT1, "foo"));
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// Note the highlighter is configured with at most 2 snippets, so the match on the
// third value ("bar baz foo") is omitted. No ellipsis is inserted either, because the
// values are displayed in full.
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
"0. text1: >foo< bar, bar >foo< baz");
// So the above works fine if the field received a match but omits it otherwise. We can
// force the display of this field by chaining with the verbatim value highlighter:
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
highlighted.or(FieldValueHighlighters.verbatimValue(FLD_TEXT1)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
"0. text1: foo bar, bar foo baz, bar baz foo, baz baz baz");
// But this is not exactly what we'd like because we want to limit the display of values
// to the first two. Let's just write a custom field highlighter handler that does it.
class AtMostNValuesHighlighter implements MatchHighlighter.FieldValueHighlighter {
private final String field;
private final int limit;
AtMostNValuesHighlighter(String field, int limit) {
this.field = field;
this.limit = limit;
}
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return Objects.equals(field, this.field);
}
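// 'values' holds the field's raw values; 'contiguousValue' is all of them glued together,
// with 'valueRanges' marking each value's span and 'matchOffsets' the query matches within
// that contiguous string. This highlighter only needs the raw values.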
@Override
public List<String> format(
String field,
String[] values,
String contiguousValue,
List<OffsetRange> valueRanges,
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
if (values.length <= limit) {
return Arrays.asList(values);
} else {
List<String> collected =
Stream.of(values).limit(limit).collect(Collectors.toList());
int remaining = values.length - collected.size();
collected.add(String.format(Locale.ROOT, "[%d omitted]", remaining));
return collected;
}
}
@Override
public Collection<String> alwaysFetchedFields() {
return Collections.singleton(field);
}
}
// We can now chain it as usual and contemplate the result.
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
highlighted.or(new AtMostNValuesHighlighter(FLD_TEXT1, 2)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
"0. text1: >foo< bar, bar >foo< baz");
assertHighlights(
toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
"0. text1: foo bar, bar foo baz, [2 omitted]");
});
}
@Test
public void testHighlightMoreQueriesAtOnceShowoff() throws IOException {
// The MatchHighlighter's underlying components are powerful enough to build interesting,
// if not always super-practical, things. In this case, we would like to highlight the
// matches of *more than one* query over the same set of input documents. This includes
// highest-scoring passage resolution (from multiple hits) and different highlight
// markers for each query.
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "foo bar baz")
.doc(FLD_TEXT1, "foo baz bar")
.build(
analyzer,
reader -> {
// Let's start with the two queries. The first one will be an unordered
// query for (foo, baz) with a max gap of 1; let's use intervals for this.
Query q1 =
new IntervalQuery(
FLD_TEXT1,
Intervals.maxgaps(
1, Intervals.unordered(Intervals.term("foo"), Intervals.term("baz"))));
// The second one will be a simpler term query for "bar".
Query q2 = new TermQuery(new Term(FLD_TEXT1, "bar"));
// Let's fetch matching documents by combining the two into a Boolean query.
Query query =
new BooleanQuery.Builder()
.add(q1, BooleanClause.Occur.SHOULD)
.add(q2, BooleanClause.Occur.SHOULD)
.build();
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// If we use the "regular" highlighter, the result will be slightly odd: a nested
// highlight over "bar" within the first match. Also, you can't distinguish which of
// the sub-queries
// caused which highlight marker... but if it were HTML then you could give the span
// some semi-translucent background and layered matches would be visible.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3,
1,
new PassageFormatter("...", "<span>", "</span>"),
FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(
toDocList(highlighter.highlight(topDocs, query)),
"0. text1: <span>foo <span>bar</span> baz</span>",
"1. text1: <span>foo baz</span> <span>bar</span>");
// To separate highlights for multiple queries we'll pass them separately to the
// highlighter and differentiate highlight markers upon their application. Let's start
// with the customized field highlighter. It utilizes the fact that match ranges passed
// from MatchHighlighter contain a reference to the original query which brought up the
// match.
class SeparateMarkerFieldHighlighter
implements MatchHighlighter.FieldValueHighlighter {
private final String field;
private final Map<Query, String> queryClassMap;
SeparateMarkerFieldHighlighter(String field, Map<Query, String> queryClassMap) {
this.field = field;
this.queryClassMap = queryClassMap;
}
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return Objects.equals(field, this.field) && hasMatches;
}
@Override
public List<String> format(
String field,
String[] values,
String contiguousValue,
List<OffsetRange> valueRanges,
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
PassageSelector passageSelector = new PassageSelector();
int maxPassageWindow = 80;
int maxPassages = 3;
List<Passage> bestPassages =
passageSelector.pickBest(
contiguousValue,
matchOffsets,
maxPassageWindow,
maxPassages,
valueRanges);
// We know the offset ranges passed to us by MatchHighlighter are instances of
// QueryOffsetRange, so we compute the class based on that.
Function<OffsetRange, String> queryToClass =
(range) ->
queryClassMap.get(((MatchHighlighter.QueryOffsetRange) range).query);
PassageFormatter passageFormatter =
new PassageFormatter(
"...",
(range) -> "<span class='" + queryToClass.apply(range) + "'>",
(range) -> "</span>");
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
}
// And this is pretty much it. We now set up query classes to display, set up the
// highlighter...
Map<Query, String> queryClassMap = Map.of(q1, "q1", q2, "q2");
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
new SeparateMarkerFieldHighlighter(FLD_TEXT1, queryClassMap))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
// ...and run highlighting. Note the queries passed to the highlighter are the
// individual sub-clauses of the Boolean query used to fetch documents.
assertHighlights(
toDocList(highlighter.highlight(topDocs, q1, q2)),
"0. text1: <span class='q1'>foo <span class='q2'>bar</span> baz</span>",
"1. text1: <span class='q1'>foo baz</span> <span class='q2'>bar</span>");
});
}
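/**
 * Renders each document's field highlights as indented lines ("%2d. field: ...") and compares
 * them, ignoring leading and trailing whitespace, against the expected lines.
 */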
private void assertHighlights(List<List<String>> docList, String... expectedFormattedLines) {
ArrayList<String> actualLines = new ArrayList<>();
for (int doc = 0; doc < docList.size(); doc++) {
List<String> fields = docList.get(doc);
for (int i = 0; i < fields.size(); i++) {
actualLines.add(
(i == 0 ? String.format(Locale.ROOT, "%2d. ", doc) : " ") + fields.get(i));
}
}
if (!Arrays.equals(
Stream.of(expectedFormattedLines).map(String::trim).toArray(),
actualLines.stream().map(String::trim).toArray())) {
throw new AssertionError(
"Actual hits were:\n"
+ String.join("\n", actualLines)
+ "\n\n but expected them to be:\n"
+ String.join("\n", expectedFormattedLines));
}
}
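/** Collapses each document's highlights into "field: value, value, ..." strings. */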
private List<List<String>> toDocList(Stream<MatchHighlighter.DocHighlights> highlights) {
return highlights
.map(
docHighlights ->
docHighlights.fields.entrySet().stream()
.map(e -> e.getKey() + ": " + String.join(", ", e.getValue()))
.collect(Collectors.toList()))
.collect(Collectors.toList());
}
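/**
 * Creates an indexable field for the given name and value: FLD_TEXT1 is indexed with positions
 * and offsets, FLD_TEXT2 with positions only.
 */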
private IndexableField toField(String name, String value) {
switch (name) {
case FLD_TEXT1:
return new Field(name, value, TYPE_TEXT_POSITIONS_OFFSETS);
case FLD_TEXT2:
return new Field(name, value, TYPE_TEXT_POSITIONS);
default:
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
}