blob: 378c557707c3418017b171bb00eb3fd711de08fa [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
/**
* An example highlighter that combines several lower-level highlighting utilities in this package
* into a fully featured, ready-to-use component.
*
* <p>Note that if you need to customize or tweak the details of highlighting, it is better to
* assemble your own highlighter using those low-level building blocks, rather than extend or modify
* this one.
*/
public class MatchHighlighter {
/** Searcher used to rewrite queries and handed to each {@link MatchRegionRetriever}. */
private final IndexSearcher searcher;
/** Offset retrieval strategies passed to each {@link MatchRegionRetriever} created by this class. */
private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies;
/** Supplies the per-field offset gap used when laying out multiple values of a field. */
private final Analyzer analyzer;
/** Names of fields that are loaded for every document, whether or not they have matches. */
private final HashSet<String> fieldsAlwaysReturned = new HashSet<>();
/** Ordered chain of field highlighters; the first applicable one formats a given field. */
private final List<FieldValueHighlighter> fieldHighlighters = new ArrayList<>();
/**
 * A highlighter working on the values of a single field. Highlighters are asked whether they
 * apply to a given (field, hasMatches) combination; the one that declares itself applicable has
 * its {@link #format} method called and the returned list becomes the field's value.
 *
 * @see FieldValueHighlighters
 */
public interface FieldValueHighlighter {
  /**
   * Tells whether this highlighter can handle the given field.
   *
   * @param field Field name
   * @param hasMatches {@code true} if the field has a non-empty set of match regions.
   * @return {@code true} if this highlighter should be used to format the field.
   */
  boolean isApplicable(String field, boolean hasMatches);

  /**
   * Formats the values of a field.
   *
   * @param field Field name
   * @param values The field's individual values.
   * @param contiguousValue All values of the field laid out as a single string.
   * @param valueRanges The offset range each value occupies.
   * @param matchOffsets Match regions of the field together with their source queries; may be
   *     {@code null} when the field has no matches.
   * @return The formatted values of the field.
   */
  List<String> format(
      String field,
      String[] values,
      String contiguousValue,
      List<OffsetRange> valueRanges,
      List<QueryOffsetRange> matchOffsets);

  /**
   * @return Returns a set of fields that must be fetched for each document, regardless of whether
   *     they had matches or not. This is useful to load and return certain fields that should
   *     always be included (identifiers, document titles, etc.).
   */
  default Collection<String> alwaysFetchedFields() {
    return Collections.emptyList();
  }

  /**
   * Combines this highlighter with another one. The combination is applicable whenever either
   * of the two is; formatting is delegated to this highlighter if it is applicable and to {@code
   * other} otherwise. The always-fetched fields of the combination are the union of both.
   */
  default FieldValueHighlighter or(FieldValueHighlighter other) {
    // "this" inside the anonymous class below would point at the combination itself.
    final FieldValueHighlighter primary = this;

    // The union is a snapshot taken when the combination is created.
    final HashSet<String> alwaysFetched = new HashSet<>(primary.alwaysFetchedFields());
    alwaysFetched.addAll(other.alwaysFetchedFields());

    return new FieldValueHighlighter() {
      @Override
      public boolean isApplicable(String field, boolean hasMatches) {
        if (primary.isApplicable(field, hasMatches)) {
          return true;
        }
        return other.isApplicable(field, hasMatches);
      }

      @Override
      public List<String> format(
          String field,
          String[] values,
          String contiguousValue,
          List<OffsetRange> valueRanges,
          List<QueryOffsetRange> matchOffsets) {
        // The caller's hasMatches flag is not available here, so derive it from the offsets.
        boolean hasMatches = !(matchOffsets == null || matchOffsets.isEmpty());
        if (primary.isApplicable(field, hasMatches)) {
          return primary.format(field, values, contiguousValue, valueRanges, matchOffsets);
        } else {
          return other.format(field, values, contiguousValue, valueRanges, matchOffsets);
        }
      }

      @Override
      public Collection<String> alwaysFetchedFields() {
        return alwaysFetched;
      }
    };
  }
}
/**
 * Append a new highlighter to field highlighters chain. The order of field highlighters is
 * important (first-matching wins). The fields the highlighter declares as always fetched are
 * added to the set of fields loaded for every document.
 *
 * @param highlighter the highlighter to append; must not be {@code null}.
 * @return this instance, so that calls can be chained.
 * @throws NullPointerException if {@code highlighter} or its always-fetched field collection is
 *     {@code null}; in that case the chain is left unchanged.
 */
public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
  Objects.requireNonNull(highlighter, "highlighter");
  // Everything that can fail runs before the chain is modified, so a bad argument
  // cannot leave a null or half-registered highlighter behind.
  Collection<String> alwaysFetched = highlighter.alwaysFetchedFields();
  fieldsAlwaysReturned.addAll(alwaysFetched);
  fieldHighlighters.add(highlighter);
  return this;
}
/**
 * Registers field names that should be loaded for every input document.
 *
 * @param fields names of the fields to always fetch; a {@code null} name causes a {@link
 *     NullPointerException}.
 */
public void alwaysFetchFields(String... fields) {
  for (int i = 0; i < fields.length; i++) {
    String name = Objects.requireNonNull(fields[i]);
    fieldsAlwaysReturned.add(name);
  }
}
/** Highlights computed for a single document. */
public static class DocHighlights {
  /** Identifier of the document these highlights belong to. */
  public final int docId;

  /** Formatted values keyed by field name, kept in insertion order. */
  public final Map<String, List<String>> fields;

  /**
   * Creates an empty set of highlights for the given document.
   *
   * @param docId identifier of the document.
   */
  public DocHighlights(int docId) {
    this.fields = new LinkedHashMap<>();
    this.docId = docId;
  }
}
/** A match's {@link OffsetRange} paired with the query that produced the match. */
public static class QueryOffsetRange extends OffsetRange {
  /** The query this range originates from. */
  public final Query query;

  QueryOffsetRange(Query query, int from, int to) {
    super(from, to);
    this.query = query;
  }

  /** Returns a new range with the given bounds that keeps this range's query. */
  @Override
  public QueryOffsetRange slice(int from, int to) {
    QueryOffsetRange sliced = new QueryOffsetRange(this.query, from, to);
    return sliced;
  }
}
/** Collects the match ranges of one hit and knows how to load that hit's stored fields. */
private static class DocHit {
  final int docId;
  private final LeafReader leafReader;
  private final int leafDocId;

  /** Match ranges keyed by field name, in the order fields were first reported. */
  private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges = new LinkedHashMap<>();

  DocHit(int docId, LeafReader leafReader, int leafDocId) {
    this.docId = docId;
    this.leafReader = leafReader;
    this.leafDocId = leafDocId;
  }

  /** Records the given per-field offsets, tagging every range with the query that caused it. */
  void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
    for (Map.Entry<String, List<OffsetRange>> hit : hits.entrySet()) {
      List<QueryOffsetRange> fieldRanges =
          matchRanges.computeIfAbsent(hit.getKey(), (fld) -> new ArrayList<>());
      for (OffsetRange range : hit.getValue()) {
        fieldRanges.add(new QueryOffsetRange(query, range.from, range.to));
      }
    }
  }

  /**
   * Loads the stored fields of this hit, restricted to fields that either have match ranges or
   * are accepted by the {@code needsField} predicate.
   */
  Document document(Predicate<String> needsField) throws IOException {
    DocumentStoredFieldVisitor fieldVisitor =
        new DocumentStoredFieldVisitor() {
          @Override
          public Status needsField(FieldInfo fieldInfo) {
            String name = fieldInfo.name;
            // Fields with matches are always loaded; the predicate decides about the rest.
            if (matchRanges.containsKey(name)) {
              return Status.YES;
            }
            return needsField.test(name) ? Status.YES : Status.NO;
          }
        };
    leafReader.document(leafDocId, fieldVisitor);
    return fieldVisitor.getDocument();
  }
}
/**
* Creates a highlighter whose offset retrieval strategies are computed by {@link
* MatchRegionRetriever#computeOffsetRetrievalStrategies} from the searcher's index reader and the
* given analyzer.
*
* @param searcher the searcher over the index containing the documents to highlight.
* @param analyzer the analyzer passed to the strategy computation and kept by this highlighter.
*/
public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
this(
searcher,
analyzer,
MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
/**
 * Creates a highlighter with explicitly provided offset retrieval strategies.
 *
 * @param searcher the searcher used to rewrite queries and retrieve match regions.
 * @param analyzer the analyzer queried for per-field offset gaps.
 * @param offsetsRetrievalStrategies the strategies handed to each match region retriever.
 */
public MatchHighlighter(
    IndexSearcher searcher,
    Analyzer analyzer,
    OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
  this.searcher = searcher;
  this.analyzer = analyzer;
  this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
}
/**
 * Computes highlights for the documents in {@code topDocs}, using match regions of each of the
 * given queries.
 *
 * @param topDocs the hits to highlight; the order of the returned stream starts with the order of
 *     {@code topDocs.scoreDocs}.
 * @param queries the queries whose match regions are collected; each region remembers the query
 *     it came from.
 * @return a stream of per-document highlights. Field values are computed when the stream is
 *     consumed, and an I/O failure at that point surfaces as an {@link UncheckedIOException}.
 * @throws IOException if rewriting a query or retrieving match regions fails.
 */
public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
  // MatchRegionRetriever streams documents in its own order, so seed the map with all hits
  // up front: a LinkedHashMap keeps the original topDocs ordering when values are filled in.
  LinkedHashMap<Integer, DocHit> hitsByDocId = new LinkedHashMap<>();
  for (ScoreDoc hit : topDocs.scoreDocs) {
    hitsByDocId.put(hit.doc, null);
  }

  // Gather match ranges query by query, tagging each range with its origin query.
  for (Query query : queries) {
    Query rewritten = searcher.rewrite(query);
    MatchRegionRetriever retriever =
        new MatchRegionRetriever(searcher, rewritten, offsetsRetrievalStrategies);
    retriever.highlightDocuments(
        topDocs,
        (int docId,
            LeafReader leafReader,
            int leafDocId,
            Map<String, List<OffsetRange>> hits) ->
            // Seeded entries hold null, which computeIfAbsent treats as absent.
            hitsByDocId
                .computeIfAbsent(docId, id -> new DocHit(id, leafReader, leafDocId))
                .addMatches(query, hits));
  }

  // Entries still null were never reported by any retriever and are skipped.
  return hitsByDocId.values().stream()
      .filter(hit -> hit != null)
      .map(hit -> computeDocFieldValues(hit));
}
/**
 * Loads the stored fields of a hit and formats each field with the first applicable field
 * highlighter.
 *
 * @param docHit the hit together with its collected match ranges.
 * @return the formatted values of the hit's fields; fields for which the highlighter returned
 *     {@code null} are omitted.
 * @throws UncheckedIOException if loading the stored fields fails.
 * @throws RuntimeException if no registered highlighter is applicable to one of the fields.
 */
private DocHighlights computeDocFieldValues(DocHit docHit) {
  Document doc;
  try {
    doc = docHit.document(fieldsAlwaysReturned::contains);
  } catch (IOException e) {
    // This method is used as a stream mapper, which cannot throw checked exceptions.
    throw new UncheckedIOException(e);
  }

  DocHighlights docHighlights = new DocHighlights(docHit.docId);
  HashSet<String> unique = new HashSet<>();
  for (IndexableField indexableField : doc) {
    String field = indexableField.name();
    // A multi-valued field shows up once per value; all its values are handled together
    // the first time the field name is seen.
    if (!unique.add(field)) {
      continue;
    }

    String[] values = doc.getValues(field);
    String contiguousValue = contiguousFieldValue(field, values);
    List<OffsetRange> valueRanges = computeValueRanges(field, values);
    List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);

    // "Has matches" means a non-empty set of match regions, as documented on
    // FieldValueHighlighter#isApplicable and as recomputed by FieldValueHighlighter#or.
    // Checking only for null here could pick a highlighter with hasMatches == true that
    // a combined highlighter would then dispatch differently for an empty list.
    boolean hasMatches = offsets != null && !offsets.isEmpty();

    List<String> formattedValues =
        fieldValueHighlighter(field, hasMatches)
            .format(field, values, contiguousValue, valueRanges, offsets);
    if (formattedValues != null) {
      docHighlights.fields.put(field, formattedValues);
    }
  }
  return docHighlights;
}
/**
 * Computes the offset range occupied by each value of a field when the values are laid out one
 * after another, separated by the analyzer's offset gap for that field.
 *
 * @param field the field name, used to look up the offset gap.
 * @param values the field's values.
 * @return one range per value, in the order of {@code values}.
 */
private List<OffsetRange> computeValueRanges(String field, String[] values) {
  // The gap depends only on the field, so look it up once instead of once per value.
  final int offsetGap = analyzer.getOffsetGap(field);

  List<OffsetRange> valueRanges = new ArrayList<>(values.length);
  int offset = 0;
  for (CharSequence v : values) {
    int end = offset + v.length();
    valueRanges.add(new OffsetRange(offset, end));
    // The next value starts after this one plus the gap.
    offset = end + offsetGap;
  }
  return valueRanges;
}
/**
 * Lays out all values of a field as a single string. A single value is returned as is; otherwise
 * values are joined with as many spaces as the analyzer's offset gap for the field.
 *
 * @param field the field name, used to look up the offset gap.
 * @param values the field's values.
 * @return the contiguous representation of {@code values}.
 */
private String contiguousFieldValue(String field, String[] values) {
  if (values.length == 1) {
    return values[0];
  }

  // TODO: This can be inefficient if offset gap is large but the logic
  // of applying offsets would get much more complicated so leaving for now
  // (would have to recalculate all offsets to omit gaps).
  int gap = analyzer.getOffsetGap(field);
  return String.join(" ".repeat(gap), values);
}
/**
 * Picks the first highlighter in the chain that declares itself applicable to the field.
 *
 * @param field the field name.
 * @param hasMatches whether the field has match regions.
 * @return the first applicable highlighter, in registration order.
 * @throws RuntimeException if none of the registered highlighters is applicable.
 */
private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) {
  // The stream is evaluated lazily, so highlighters after the first match are not probed.
  return fieldHighlighters.stream()
      .filter(candidate -> candidate.isApplicable(field, hasMatches))
      .findFirst()
      .orElseThrow(
          () -> new RuntimeException("No field highlighter could be matched to field: " + field));
}
}