/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanScorer;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
/**
 * Helps the {@link FieldOffsetStrategy} with position-sensitive queries (e.g. highlighting phrases correctly).
 * This is a stateful class holding information about the query, but it can be (and is) re-used across the
 * documents being highlighted. Despite this state, it's immutable after construction.
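 * <p>
 * Typical lifecycle: construct once for a query (construction performs the extraction), then call
 * {@link #createOffsetsEnumsForSpans} for each document to be highlighted.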
*
* @lucene.internal
*/
// TODO rename to SpanHighlighting ?
public class PhraseHelper {
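  /** A sentinel for when there is no position-sensitive handling to do; it extracts nothing, so {@link #hasPositionSensitivity()} returns false. */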
public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
(s) -> false, spanQuery -> null, query -> null, true);
private final String fieldName;
private final Set<BytesRef> positionInsensitiveTerms; // (TermQuery terms)
private final Set<SpanQuery> spanQueries;
private final boolean willRewrite;
private final Predicate<String> fieldMatcher;
/**
* Constructor.
* {@code rewriteQueryPred} is an extension hook to override the default choice of
* {@link WeightedSpanTermExtractor#mustRewriteQuery(SpanQuery)}. By default unknown query types are rewritten,
* so use this to return {@link Boolean#FALSE} if you know the query doesn't need to be rewritten.
 * Similarly, {@code preExtractRewriteFunction} is an extension hook that lets a query be replaced with
 * different queries before the {@link WeightedSpanTermExtractor}'s extraction is invoked.
* {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is
* usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones.
 * {@code fieldMatcher} is the field name predicate used to extract the query parts that must be highlighted.
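 *
 * <p>A hypothetical construction with pass-through hooks might look like this (the field name and
 * lambdas are illustrative, not prescriptive):
 * <pre>{@code
 * PhraseHelper phraseHelper = new PhraseHelper(query, "body",
 *     fieldName -> fieldName.equals("body"), // fieldMatcher
 *     spanQuery -> null, // rewriteQueryPred: defer to mustRewriteQuery's default
 *     anyQuery -> null,  // preExtractRewriteFunction: no pre-extract substitution
 *     false);            // don't ignore queries needing rewrite
 * }</pre>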
*/
public PhraseHelper(Query query, String field, Predicate<String> fieldMatcher, Function<SpanQuery, Boolean> rewriteQueryPred,
Function<Query, Collection<Query>> preExtractRewriteFunction,
boolean ignoreQueriesNeedingRewrite) {
this.fieldName = field;
this.fieldMatcher = fieldMatcher;
// filter terms to those we want
positionInsensitiveTerms = new HashSet<>();
spanQueries = new HashSet<>();
// TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls
boolean[] mustRewriteHolder = {false}; // boolean wrapped in 1-ary array so it's mutable from inner class
// For TermQueries or other position insensitive queries, collect the Terms.
// For other Query types, WSTE will convert to an equivalent SpanQuery. NOT extracting position spans here.
new WeightedSpanTermExtractor(field) {
//anonymous constructor
{
setExpandMultiTermQuery(true); //necessary for mustRewriteQuery(spanQuery) to work.
try {
extract(query, 1f, null); // null because we won't actually extract right now; we're not collecting
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> terms) throws IOException {
Collection<Query> newQueriesToExtract = preExtractRewriteFunction.apply(query);
if (newQueriesToExtract != null) {
for (Query newQuery : newQueriesToExtract) {
extract(newQuery, boost, terms);
}
} else {
super.extract(query, boost, terms);
}
}
@Override
protected boolean isQueryUnsupported(Class<? extends Query> clazz) {
if (clazz.isAssignableFrom(MultiTermQuery.class)) {
return true; //We do MTQ processing separately in MultiTermHighlighting.java
}
return true; //TODO set to false and provide a hook to customize certain queries.
}
// called on Query types that are NOT position sensitive, e.g. TermQuery
@Override
protected void extractWeightedTerms(Map<String, WeightedSpanTerm> terms, Query query, float boost) {
query.visit(new QueryVisitor() {
@Override
public boolean acceptField(String field) {
return fieldMatcher.test(field);
}
@Override
public void consumeTerms(Query query, Term... terms) {
for (Term term : terms) {
positionInsensitiveTerms.add(term.bytes());
}
}
});
}
// called on SpanQueries. Some other position-sensitive queries like PhraseQuery are converted beforehand
@Override
protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
float boost) throws IOException {
// if this span query isn't for this field, skip it.
Set<String> fieldNameSet = new HashSet<>();//TODO reuse. note: almost always size 1
collectSpanQueryFields(spanQuery, fieldNameSet);
for (String spanField : fieldNameSet) {
if (!fieldMatcher.test(spanField)) {
return;
}
}
boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
if (ignoreQueriesNeedingRewrite && mustRewriteQuery) {
return;// ignore this query
}
mustRewriteHolder[0] |= mustRewriteQuery;
spanQueries.add(spanQuery);
}
@Override
protected boolean mustRewriteQuery(SpanQuery spanQuery) {
        Boolean rewriteQ = rewriteQueryPred.apply(spanQuery); // allows the predicate to override the default
return rewriteQ != null ? rewriteQ : super.mustRewriteQuery(spanQuery);
}
}; // calling the constructor triggered the extraction/visiting we want. Hacky; yes.
willRewrite = mustRewriteHolder[0];
}
public Set<SpanQuery> getSpanQueries() {
return spanQueries;
}
/**
   * If there is no position sensitivity then this instance can be ignored.
*/
public boolean hasPositionSensitivity() {
return spanQueries.isEmpty() == false;
}
/**
   * Rewrite is needed to handle a {@link SpanMultiTermQueryWrapper} (MTQ / wildcards) or some
   * custom queries. When true, the resulting term list will probably differ from what it was initially
   * known to be.
*/
public boolean willRewrite() {
return willRewrite;
}
/** Returns the terms that are position-insensitive (sorted). */
public BytesRef[] getAllPositionInsensitiveTerms() {
BytesRef[] result = positionInsensitiveTerms.toArray(new BytesRef[positionInsensitiveTerms.size()]);
Arrays.sort(result);
return result;
}
  /** Given the internal SpanQueries, produce OffsetsEnums for {@code docId}, adding them to the {@code results} param. */
public void createOffsetsEnumsForSpans(LeafReader leafReader, int docId, List<OffsetsEnum> results) throws IOException {
leafReader = new SingleFieldWithOffsetsFilterLeafReader(leafReader, fieldName);
//TODO avoid searcher and do what it does to rewrite & get weight?
IndexSearcher searcher = new IndexSearcher(leafReader);
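    // no query caching; this searcher wraps a throwaway reader used for a single document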
searcher.setQueryCache(null);
    // for each SpanQuery, grab its Spans and put it into a PriorityQueue
PriorityQueue<Spans> spansPriorityQueue = new PriorityQueue<Spans>(spanQueries.size()) {
@Override
protected boolean lessThan(Spans a, Spans b) {
return a.startPosition() <= b.startPosition();
}
};
for (Query query : spanQueries) {
Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1);
Scorer scorer = weight.scorer(leafReader.getContext());
if (scorer == null) {
continue;
}
TwoPhaseIterator twoPhaseIterator = scorer.twoPhaseIterator();
if (twoPhaseIterator != null) {
if (twoPhaseIterator.approximation().advance(docId) != docId || !twoPhaseIterator.matches()) {
continue;
}
      } else if (scorer.iterator().advance(docId) != docId) { // pre-position the iterator; skip this query if the doc doesn't match
continue;
}
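      // every query in spanQueries is a SpanQuery, so the scorer here is a SpanScorer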
Spans spans = ((SpanScorer) scorer).getSpans();
assert spans.docID() == docId;
if (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
spansPriorityQueue.add(spans);
}
}
// Iterate the Spans in the PriorityQueue, collecting as we go. By using a PriorityQueue ordered by position,
// the underlying offsets in our collector will be mostly appended to the end of arrays (efficient).
    // note: alternatively it'd be interesting if we produced one OffsetsEnum that internally advanced
// this PriorityQueue when nextPosition is called; it would cap what we have to cache for large docs and
// exiting early (due to maxLen) is easy.
// But at least we have an accurate "freq" and it shouldn't be too much data to collect. Even SpanScorer
// navigates the spans fully to compute a good freq (and thus score)!
OffsetSpanCollector spanCollector = new OffsetSpanCollector();
while (spansPriorityQueue.size() > 0) {
Spans spans = spansPriorityQueue.top();
//TODO limit to a capped endOffset length somehow so we can break this loop early
spans.collect(spanCollector);
if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) {
spansPriorityQueue.pop();
} else {
spansPriorityQueue.updateTop();
}
}
results.addAll(spanCollector.termToOffsetsEnums.values());
}
/**
   * Needed to support the ability to highlight a query irrespective of the field the query refers to
   * (aka requireFieldMatch=false).
* This reader will just delegate every call to a single field in the wrapped
* LeafReader. This way we ensure that all queries going through this reader target the same field.
*/
private static final class SingleFieldWithOffsetsFilterLeafReader extends FilterLeafReader {
final String fieldName;
SingleFieldWithOffsetsFilterLeafReader(LeafReader in, String fieldName) {
super(in);
this.fieldName = fieldName;
}
@Override
public FieldInfos getFieldInfos() {
throw new UnsupportedOperationException();//TODO merge them
}
@Override
public Terms terms(String field) throws IOException {
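      // note: the requested field is intentionally ignored; we always delegate to the single fieldName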
// ensure the underlying PostingsEnum returns offsets. It's sad we have to do this to use the SpanCollector.
return new FilterTerms(super.terms(fieldName)) {
@Override
public TermsEnum iterator() throws IOException {
return new FilterTermsEnum(in.iterator()) {
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
return super.postings(reuse, flags | PostingsEnum.OFFSETS);
}
};
}
};
}
@Override
public NumericDocValues getNormValues(String field) throws IOException {
return super.getNormValues(fieldName);
}
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
}
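  /** Collects the offsets of span matches, accumulating one {@link SpanCollectedOffsetsEnum} per collected term. */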
private class OffsetSpanCollector implements SpanCollector {
Map<BytesRef, SpanCollectedOffsetsEnum> termToOffsetsEnums = new HashMap<>();
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
if (!fieldMatcher.test(term.field())) {
return;
}
SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes());
if (offsetsEnum == null) {
        // If it's position-insensitive we handle it outside of PhraseHelper. term.field() is from the Query.
if (positionInsensitiveTerms.contains(term.bytes())) {
return;
}
offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq());
termToOffsetsEnums.put(term.bytes(), offsetsEnum);
}
offsetsEnum.add(postings.startOffset(), postings.endOffset());
}
@Override
public void reset() { // called when at a new position. We don't care.
}
}
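  /** An {@link OffsetsEnum} over a pre-collected, sorted, de-duplicated array of offset pairs for one term. */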
private static class SpanCollectedOffsetsEnum extends OffsetsEnum {
// TODO perhaps optionally collect (and expose) payloads?
private final BytesRef term;
private final int[] startOffsets;
private final int[] endOffsets;
private int numPairs = 0;
    private int enumIdx = -1; // -1 means nextPosition() hasn't been called yet
private SpanCollectedOffsetsEnum(BytesRef term, int postingsFreq) {
this.term = term;
this.startOffsets = new int[postingsFreq]; // hopefully not wasteful? At least we needn't resize it.
this.endOffsets = new int[postingsFreq];
}
    // called by the collector before this enum is navigated
void add(int startOffset, int endOffset) {
assert enumIdx == -1 : "bad state";
// loop backwards since we expect a match at the end or close to it. We expect O(1) not O(N).
int pairIdx = numPairs - 1;
for (; pairIdx >= 0; pairIdx--) {
int iStartOffset = startOffsets[pairIdx];
int iEndOffset = endOffsets[pairIdx];
int cmp = Integer.compare(iStartOffset, startOffset);
if (cmp == 0) {
cmp = Integer.compare(iEndOffset, endOffset);
}
if (cmp == 0) {
return; // we already have this offset-pair for this term
} else if (cmp < 0) {
break; //we will insert offsetPair to the right of pairIdx
}
}
// pairIdx is now one position to the left of where we insert the new pair
// shift right any pairs by one to make room
final int shiftLen = numPairs - (pairIdx + 1);
if (shiftLen > 0) {
System.arraycopy(startOffsets, pairIdx + 1, startOffsets, pairIdx + 2, shiftLen);
System.arraycopy(endOffsets, pairIdx + 1, endOffsets, pairIdx + 2, shiftLen);
}
// now we can place the offset pair
startOffsets[pairIdx + 1] = startOffset;
endOffsets[pairIdx + 1] = endOffset;
numPairs++;
}
@Override
public boolean nextPosition() throws IOException {
return ++enumIdx < numPairs;
}
@Override
public int freq() throws IOException {
return numPairs;
}
@Override
public BytesRef getTerm() throws IOException {
return term;
}
@Override
public int startOffset() throws IOException {
return startOffsets[enumIdx];
}
@Override
public int endOffset() throws IOException {
return endOffsets[enumIdx];
}
}
}