blob: b71f37a27a63ae9fef2cd5ddfde2e4259d8adf1a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PrimitiveIterator;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
/**
* Utility class to compute a list of "match regions" for a given query, searcher and document(s)
* using {@link Matches} API.
*/
public class MatchRegionRetriever {
private final List<LeafReaderContext> leaves;
private final Weight weight;
private final TreeSet<String> affectedFields;
private final Map<String, OffsetsRetrievalStrategy> offsetStrategies;
private final Set<String> preloadFields;
/**
* A callback for accepting a single document (and its associated leaf reader, leaf document ID)
* and its match offset ranges, as indicated by the {@link Matches} interface retrieved for the
* query.
*/
@FunctionalInterface
public interface MatchOffsetsConsumer {
void accept(
int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
throws IOException;
}
/**
* An abstraction that provides document values for a given field. Default implementation in
* {@link DocumentFieldValueProvider} just reaches to a preloaded {@link Document}. It is possible
* to write a more efficient implementation on top of a reusable character buffer (that reuses the
* buffer while retrieving hit regions for documents).
*/
@FunctionalInterface
public interface FieldValueProvider {
List<CharSequence> getValues(String field);
}
/**
* A constructor with the default offset strategy supplier.
*
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields in the
* absence of position offsets in the index. Note that the analyzer must return tokens
* (positions and offsets) identical to the ones stored in the index.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer)
throws IOException {
this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
/**
* @param searcher Index searcher to be used for retrieving matches.
* @param query The query for which matches should be retrieved. The query should be rewritten
* against the provided searcher.
* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link
* OffsetsRetrievalStrategy} instances.
*/
public MatchRegionRetriever(
IndexSearcher searcher,
Query query,
OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
throws IOException {
leaves = searcher.getIndexReader().leaves();
assert checkOrderConsistency(leaves);
// We need full scoring mode so that we can receive matches from all sub-clauses
// (no optimizations in Boolean queries take place).
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0);
// Compute the subset of fields affected by this query so that we don't load or scan
// fields that are irrelevant.
affectedFields = new TreeSet<>();
query.visit(
new QueryVisitor() {
@Override
public boolean acceptField(String field) {
affectedFields.add(field);
return false;
}
});
// Compute value offset retrieval strategy for all affected fields.
offsetStrategies = new HashMap<>();
for (String field : affectedFields) {
offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field));
}
// Ask offset strategies if they'll need field values.
preloadFields = new HashSet<>();
offsetStrategies.forEach(
(field, strategy) -> {
if (strategy.requiresDocument()) {
preloadFields.add(field);
}
});
// Only preload those field values that can be affected by the query and are required
// by strategies.
preloadFields.retainAll(affectedFields);
}
public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer)
throws IOException {
highlightDocuments(
Arrays.stream(topDocs.scoreDocs).mapToInt(scoreDoc -> scoreDoc.doc).sorted().iterator(),
consumer);
}
/**
* Low-level, high-efficiency method for highlighting large numbers of documents at once in a
* streaming fashion.
*
* @param docIds A stream of <em>sorted</em> document identifiers for which hit ranges should be
* returned.
* @param consumer A streaming consumer for document-hits pairs.
*/
public void highlightDocuments(PrimitiveIterator.OfInt docIds, MatchOffsetsConsumer consumer)
throws IOException {
if (leaves.isEmpty()) {
return;
}
Iterator<LeafReaderContext> ctx = leaves.iterator();
LeafReaderContext currentContext = ctx.next();
int previousDocId = -1;
Map<String, List<OffsetRange>> highlights = new TreeMap<>();
while (docIds.hasNext()) {
int docId = docIds.nextInt();
if (docId < previousDocId) {
throw new RuntimeException("Input document IDs must be sorted (increasing).");
}
previousDocId = docId;
while (docId >= currentContext.docBase + currentContext.reader().maxDoc()) {
currentContext = ctx.next();
}
int contextRelativeDocId = docId - currentContext.docBase;
// Only preload fields we may potentially need.
FieldValueProvider documentSupplier;
if (preloadFields.isEmpty()) {
documentSupplier = null;
} else {
Document doc = currentContext.reader().document(contextRelativeDocId, preloadFields);
documentSupplier = new DocumentFieldValueProvider(doc);
}
highlights.clear();
highlightDocument(
currentContext, contextRelativeDocId, documentSupplier, (field) -> true, highlights);
consumer.accept(docId, currentContext.reader(), contextRelativeDocId, highlights);
}
}
/**
* Low-level method for retrieving hit ranges for a single document. This method can be used with
* custom document {@link FieldValueProvider}.
*/
public void highlightDocument(
LeafReaderContext leafReaderContext,
int contextDocId,
FieldValueProvider doc,
Predicate<String> acceptField,
Map<String, List<OffsetRange>> outputHighlights)
throws IOException {
Matches matches = weight.matches(leafReaderContext, contextDocId);
if (matches == null) {
return;
}
for (String field : affectedFields) {
if (acceptField.test(field)) {
MatchesIterator matchesIterator = matches.getMatches(field);
if (matchesIterator == null) {
// No matches on this field, even though the field was part of the query. This may be
// possible
// with complex queries that source non-text fields (have no "hit regions" in any textual
// representation). Skip.
} else {
OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field);
if (offsetStrategy == null) {
throw new IOException(
"Non-empty matches but no offset retrieval strategy for field: " + field);
}
List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
if (!ranges.isEmpty()) {
outputHighlights.put(field, ranges);
}
}
}
}
}
private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
for (int i = 1; i < leaves.size(); i++) {
LeafReaderContext prev = leaves.get(i - 1);
LeafReaderContext next = leaves.get(i);
assert prev.docBase <= next.docBase;
assert prev.docBase + prev.reader().maxDoc() == next.docBase;
}
return true;
}
/**
* Compute default strategies for retrieving offsets from {@link MatchesIterator} instances for a
* set of given fields.
*/
public static OffsetsRetrievalStrategySupplier computeOffsetRetrievalStrategies(
IndexReader reader, Analyzer analyzer) {
FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
return (field) -> {
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
return (mi, doc) -> {
throw new IOException("FieldInfo is null for field: " + field);
};
}
switch (fieldInfo.getIndexOptions()) {
case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
return new OffsetsFromMatchIterator(field);
case DOCS_AND_FREQS_AND_POSITIONS:
return new OffsetsFromPositions(field, analyzer);
case DOCS_AND_FREQS:
case DOCS:
// By default retrieve offsets from individual tokens
// retrieved by the analyzer (possibly narrowed down to
// only those terms that the query hinted at when passed
// a QueryVisitor.
//
// Alternative strategies are also possible and may make sense
// depending on the use case (OffsetsFromValues, for example).
return new OffsetsFromTokens(field, analyzer);
default:
return (matchesIterator, doc) -> {
throw new IOException(
"Field is indexed without positions and/or offsets: "
+ field
+ ", "
+ fieldInfo.getIndexOptions());
};
}
};
}
/** Implements {@link FieldValueProvider} wrapping a preloaded {@link Document}. */
private static final class DocumentFieldValueProvider implements FieldValueProvider {
private final Document doc;
public DocumentFieldValueProvider(Document doc) {
this.doc = doc;
}
@Override
public List<CharSequence> getValues(String field) {
return Arrays.asList(doc.getValues(field));
}
}
}