blob: d3998833f46fe314b346f6adbf8ddc2c59e3c738 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.function.Supplier;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
/**
* An enumeration/iterator of a term and its offsets for use by {@link FieldHighlighter}.
* It is advanced and is placed in a priority queue by
* {@link FieldHighlighter#highlightOffsetsEnums(OffsetsEnum)} based on the start offset.
*
* @lucene.internal
*/
public abstract class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
// note: the ordering clearly changes as the postings enum advances
// note: would be neat to use some Comparator utilities with method
// references but our methods throw IOException
@Override
public int compareTo(OffsetsEnum other) {
try {
int cmp = Integer.compare(startOffset(), other.startOffset());
if (cmp != 0) {
return cmp; // vast majority of the time we return here.
}
cmp = Integer.compare(endOffset(), other.endOffset());
if (cmp != 0) {
return cmp;
}
final BytesRef thisTerm = this.getTerm();
final BytesRef otherTerm = other.getTerm();
if (thisTerm == null || otherTerm == null) {
if (thisTerm == null && otherTerm == null) {
return 0;
} else if (thisTerm == null) {
return 1; // put "this" (wildcard mtq enum) last
} else {
return -1;
}
}
return thisTerm.compareTo(otherTerm);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Advances to the next position and returns true, or if can't then returns false.
* Note that the initial state of this class is not positioned.
*/
public abstract boolean nextPosition() throws IOException;
/** An estimate of the number of occurrences of this term/OffsetsEnum. */
public abstract int freq() throws IOException;
/**
* The term at this position.
* This BytesRef is safe to continue to refer to, even after we move to the next position.
*
* @see Passage#getMatchTerms()
*/
public abstract BytesRef getTerm() throws IOException;
public abstract int startOffset() throws IOException;
public abstract int endOffset() throws IOException;
@Override
public void close() throws IOException {
}
@Override
public String toString() {
final String name = getClass().getSimpleName();
String offset = "";
try {
offset = ",[" + startOffset() + "-" + endOffset() + "]";
} catch (Exception e) {
//ignore; for debugging only
}
try {
return name + "(term:" + getTerm().utf8ToString() + offset + ")";
} catch (Exception e) {
return name;
}
}
/**
* Based on a {@link PostingsEnum} -- the typical/standard OE impl.
*/
public static class OfPostings extends OffsetsEnum {
private final BytesRef term;
private final PostingsEnum postingsEnum; // with offsets
private final int freq;
private int posCounter = -1;
public OfPostings(BytesRef term, int freq, PostingsEnum postingsEnum) throws IOException {
this.term = Objects.requireNonNull(term);
this.postingsEnum = Objects.requireNonNull(postingsEnum);
this.freq = freq;
this.posCounter = this.postingsEnum.freq();
}
public OfPostings(BytesRef term, PostingsEnum postingsEnum) throws IOException {
this(term, postingsEnum.freq(), postingsEnum);
}
public PostingsEnum getPostingsEnum() {
return postingsEnum;
}
@Override
public boolean nextPosition() throws IOException {
if (posCounter > 0) {
posCounter--;
postingsEnum.nextPosition(); // note: we don't need to save the position
return true;
} else {
return false;
}
}
@Override
public BytesRef getTerm() throws IOException {
return term;
}
@Override
public int startOffset() throws IOException {
return postingsEnum.startOffset();
}
@Override
public int endOffset() throws IOException {
return postingsEnum.endOffset();
}
@Override
public int freq() throws IOException {
return freq;
}
}
/** Based on a {@link MatchesIterator} with submatches. */
public static class OfMatchesIteratorWithSubs extends OffsetsEnum {
//Either CachedOE impls (which are the submatches) or OfMatchesIterator impls
private final PriorityQueue<OffsetsEnum> pendingQueue = new PriorityQueue<>();
private final HashMap<Query,BytesRef> queryToTermMap = new HashMap<>();
public OfMatchesIteratorWithSubs(MatchesIterator matchesIterator) {
pendingQueue.add(new OfMatchesIterator(matchesIterator, () -> queryToTerm(matchesIterator.getQuery())));
}
@Override
public boolean nextPosition() throws IOException {
OffsetsEnum formerHeadOE = pendingQueue.poll(); // removes the head
if (formerHeadOE instanceof CachedOE) {
// we're done with the former head. CachedOE's are one use only.
// Look at the new head...
OffsetsEnum newHeadOE = pendingQueue.peek();
if (newHeadOE instanceof OfMatchesIterator) {
// We found the matchesIterator. Requires processing.
nextWhenMatchesIterator((OfMatchesIterator) newHeadOE); // May or may not remove or re-queue itself
} // else new head is a CacheOE or no more. Nothing to do with it.
} else { // formerHeadOE is OfMatchesIterator; advance it
OfMatchesIterator miOE = (OfMatchesIterator) formerHeadOE;
if (miOE.nextPosition()) {
nextWhenMatchesIterator(miOE); // requires processing. May or may not re-enqueue itself
}
}
return pendingQueue.isEmpty() == false;
}
private void nextWhenMatchesIterator(OfMatchesIterator miOE) throws IOException {
boolean isHead = miOE == pendingQueue.peek();
MatchesIterator subMatches = miOE.matchesIterator.getSubMatches();
if (subMatches != null) {
// remove this miOE from the queue, add it's submatches, next() it, then re-enqueue it
if (isHead) {
pendingQueue.poll(); // remove
}
enqueueCachedMatches(subMatches);
if (miOE.nextPosition()) {
pendingQueue.add(miOE);
assert pendingQueue.peek() != miOE; // miOE should follow cached entries
}
} else { // else has no subMatches. It will stay enqueued.
if (!isHead) {
pendingQueue.add(miOE);
} // else it's *already* in pendingQueue
}
}
private boolean enqueueCachedMatches(MatchesIterator thisMI) throws IOException {
if (thisMI == null) {
return false;
} else {
while (thisMI.next()) {
if (false == enqueueCachedMatches(thisMI.getSubMatches())) { // recursion
// if no sub-matches then add ourselves
pendingQueue.add(new CachedOE(queryToTerm(thisMI.getQuery()), thisMI.startOffset(), thisMI.endOffset()));
}
}
return true;
}
}
/** Maps a Query from {@link MatchesIterator#getQuery()} to {@link OffsetsEnum#getTerm()}.
* See {@link Passage#getMatchTerms()}. */
private BytesRef queryToTerm(Query query) {
// compute an approximate BytesRef term of a Query. We cache this since we're likely to see the same query again.
// Our approach is to visit each matching term in order, concatenating them with an adjoining space.
// If we don't have any (perhaps due to an MTQ like a wildcard) then we fall back on the toString() of the query.
return queryToTermMap.computeIfAbsent(query, (Query q) -> {
BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
q.visit(new QueryVisitor() {
@Override
public void consumeTerms(Query query, Term... terms) {
for (Term term : terms) {
if (bytesRefBuilder.length() > 0) {
bytesRefBuilder.append((byte) ' ');
}
bytesRefBuilder.append(term.bytes());
}
}
});
if (bytesRefBuilder.length() > 0) {
return bytesRefBuilder.get();
}
// fallback: (likely a MultiTermQuery)
return new BytesRef(q.toString());
});
}
@Override
public int freq() throws IOException {
return pendingQueue.peek().freq();
}
@Override
public BytesRef getTerm() throws IOException {
return pendingQueue.peek().getTerm();
}
@Override
public int startOffset() throws IOException {
return pendingQueue.peek().startOffset();
}
@Override
public int endOffset() throws IOException {
return pendingQueue.peek().endOffset();
}
private static class CachedOE extends OffsetsEnum {
final BytesRef term;
final int startOffset;
final int endOffset;
private CachedOE(BytesRef term, int startOffset, int endOffset) {
this.term = term;
this.startOffset = startOffset;
this.endOffset = endOffset;
}
@Override
public boolean nextPosition() throws IOException {
return false;
}
@Override
public int freq() throws IOException {
return 1; // documented short-coming of MatchesIterator based UnifiedHighlighter
}
@Override
public BytesRef getTerm() throws IOException {
return term;
}
@Override
public int startOffset() throws IOException {
return startOffset;
}
@Override
public int endOffset() throws IOException {
return endOffset;
}
}
}
/** Based on a {@link MatchesIterator}; does not look at submatches. */
public static class OfMatchesIterator extends OffsetsEnum {
private final MatchesIterator matchesIterator;
private final Supplier<BytesRef> termSupplier;
public OfMatchesIterator(MatchesIterator matchesIterator, Supplier<BytesRef> termSupplier) {
this.matchesIterator = matchesIterator;
this.termSupplier = termSupplier;
}
@Override
public boolean nextPosition() throws IOException {
return matchesIterator.next();
}
@Override
public int freq() throws IOException {
return 1; // documented short-coming of MatchesIterator based UnifiedHighlighter
}
@Override
public BytesRef getTerm() throws IOException {
return termSupplier.get();
}
@Override
public int startOffset() throws IOException {
return matchesIterator.startOffset();
}
@Override
public int endOffset() throws IOException {
return matchesIterator.endOffset();
}
}
/**
* Empty enumeration
*/
public static final OffsetsEnum EMPTY = new OffsetsEnum() {
@Override
public boolean nextPosition() throws IOException {
return false;
}
@Override
public BytesRef getTerm() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int startOffset() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int endOffset() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int freq() throws IOException {
return 0;
}
};
/**
* A view over several OffsetsEnum instances, merging them in-place
*/
//If OffsetsEnum and MatchesIterator ever truly merge then this could go away in lieu of DisjunctionMatchesIterator
public static class MultiOffsetsEnum extends OffsetsEnum {
private final PriorityQueue<OffsetsEnum> queue;
private boolean started = false;
public MultiOffsetsEnum(List<OffsetsEnum> inner) throws IOException {
this.queue = new PriorityQueue<>();
for (OffsetsEnum oe : inner) {
if (oe.nextPosition())
this.queue.add(oe);
}
}
@Override
public boolean nextPosition() throws IOException {
if (started == false) {
started = true;
return this.queue.size() > 0;
}
if (this.queue.size() > 0) {
OffsetsEnum top = this.queue.poll();
if (top.nextPosition()) {
this.queue.add(top);
return true;
}
else {
top.close();
}
return this.queue.size() > 0;
}
return false;
}
@Override
public BytesRef getTerm() throws IOException {
return this.queue.peek().getTerm();
}
@Override
public int startOffset() throws IOException {
return this.queue.peek().startOffset();
}
@Override
public int endOffset() throws IOException {
return this.queue.peek().endOffset();
}
@Override
public int freq() throws IOException {
return this.queue.peek().freq();
}
@Override
public void close() throws IOException {
// most child enums will have been closed in .nextPosition()
// here all remaining non-exhausted enums are closed
IOUtils.close(queue);
}
}
}