blob: 57eb42f69210d6f25e82d52bf8c2e984afb20954 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.PriorityQueue;
/**
* Exposes {@link TermsEnum} API, merged from {@link TermsEnum} API of sub-segments.
* This does a merge sort, by term text, of the sub-readers.
*
* @lucene.experimental
*/
public final class MultiTermsEnum extends BaseTermsEnum {
private static final Comparator<TermsEnumWithSlice> INDEX_COMPARATOR = new Comparator<TermsEnumWithSlice>() {
@Override
public int compare(TermsEnumWithSlice o1, TermsEnumWithSlice o2) {
return o1.index - o2.index;
}
};
private final TermMergeQueue queue;
private final TermsEnumWithSlice[] subs; // all of our subs (one per sub-reader)
private final TermsEnumWithSlice[] currentSubs; // current subs that have at least one term for this field
private final TermsEnumWithSlice[] top;
private final MultiPostingsEnum.EnumWithSlice[] subDocs;
private BytesRef lastSeek;
private boolean lastSeekExact;
private final BytesRefBuilder lastSeekScratch = new BytesRefBuilder();
private int numTop;
private int numSubs;
private BytesRef current;
static class TermsEnumIndex {
public final static TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0];
final int subIndex;
final TermsEnum termsEnum;
public TermsEnumIndex(TermsEnum termsEnum, int subIndex) {
this.termsEnum = termsEnum;
this.subIndex = subIndex;
}
}
/** Returns how many sub-reader slices contain the current
* term. @see #getMatchArray */
public int getMatchCount() {
return numTop;
}
/** Returns sub-reader slices positioned to the current term. */
public TermsEnumWithSlice[] getMatchArray() {
return top;
}
/** Sole constructor.
* @param slices Which sub-reader slices we should
* merge. */
public MultiTermsEnum(ReaderSlice[] slices) {
queue = new TermMergeQueue(slices.length);
top = new TermsEnumWithSlice[slices.length];
subs = new TermsEnumWithSlice[slices.length];
subDocs = new MultiPostingsEnum.EnumWithSlice[slices.length];
for(int i=0;i<slices.length;i++) {
subs[i] = new TermsEnumWithSlice(i, slices[i]);
subDocs[i] = new MultiPostingsEnum.EnumWithSlice();
subDocs[i].slice = slices[i];
}
currentSubs = new TermsEnumWithSlice[slices.length];
}
@Override
public BytesRef term() {
return current;
}
/** The terms array must be newly created TermsEnum, ie
* {@link TermsEnum#next} has not yet been called. */
public TermsEnum reset(TermsEnumIndex[] termsEnumsIndex) throws IOException {
assert termsEnumsIndex.length <= top.length;
numSubs = 0;
numTop = 0;
queue.clear();
for(int i=0;i<termsEnumsIndex.length;i++) {
final TermsEnumIndex termsEnumIndex = termsEnumsIndex[i];
assert termsEnumIndex != null;
final BytesRef term = termsEnumIndex.termsEnum.next();
if (term != null) {
final TermsEnumWithSlice entry = subs[termsEnumIndex.subIndex];
entry.reset(termsEnumIndex.termsEnum, term);
queue.add(entry);
currentSubs[numSubs++] = entry;
} else {
// field has no terms
}
}
if (queue.size() == 0) {
return TermsEnum.EMPTY;
} else {
return this;
}
}
@Override
public boolean seekExact(BytesRef term) throws IOException {
queue.clear();
numTop = 0;
boolean seekOpt = false;
if (lastSeek != null && lastSeek.compareTo(term) <= 0) {
seekOpt = true;
}
lastSeek = null;
lastSeekExact = true;
for(int i=0;i<numSubs;i++) {
final boolean status;
// LUCENE-2130: if we had just seek'd already, prior
// to this seek, and the new seek term is after the
// previous one, don't try to re-seek this sub if its
// current term is already beyond this new seek term.
// Doing so is a waste because this sub will simply
// seek to the same spot.
if (seekOpt) {
final BytesRef curTerm = currentSubs[i].current;
if (curTerm != null) {
final int cmp = term.compareTo(curTerm);
if (cmp == 0) {
status = true;
} else if (cmp < 0) {
status = false;
} else {
status = currentSubs[i].terms.seekExact(term);
}
} else {
status = false;
}
} else {
status = currentSubs[i].terms.seekExact(term);
}
if (status) {
top[numTop++] = currentSubs[i];
current = currentSubs[i].current = currentSubs[i].terms.term();
assert term.equals(currentSubs[i].current);
}
}
// if at least one sub had exact match to the requested
// term then we found match
return numTop > 0;
}
@Override
public SeekStatus seekCeil(BytesRef term) throws IOException {
queue.clear();
numTop = 0;
lastSeekExact = false;
boolean seekOpt = false;
if (lastSeek != null && lastSeek.compareTo(term) <= 0) {
seekOpt = true;
}
lastSeekScratch.copyBytes(term);
lastSeek = lastSeekScratch.get();
for(int i=0;i<numSubs;i++) {
final SeekStatus status;
// LUCENE-2130: if we had just seek'd already, prior
// to this seek, and the new seek term is after the
// previous one, don't try to re-seek this sub if its
// current term is already beyond this new seek term.
// Doing so is a waste because this sub will simply
// seek to the same spot.
if (seekOpt) {
final BytesRef curTerm = currentSubs[i].current;
if (curTerm != null) {
final int cmp = term.compareTo(curTerm);
if (cmp == 0) {
status = SeekStatus.FOUND;
} else if (cmp < 0) {
status = SeekStatus.NOT_FOUND;
} else {
status = currentSubs[i].terms.seekCeil(term);
}
} else {
status = SeekStatus.END;
}
} else {
status = currentSubs[i].terms.seekCeil(term);
}
if (status == SeekStatus.FOUND) {
top[numTop++] = currentSubs[i];
current = currentSubs[i].current = currentSubs[i].terms.term();
queue.add(currentSubs[i]);
} else {
if (status == SeekStatus.NOT_FOUND) {
currentSubs[i].current = currentSubs[i].terms.term();
assert currentSubs[i].current != null;
queue.add(currentSubs[i]);
} else {
assert status == SeekStatus.END;
// enum exhausted
currentSubs[i].current = null;
}
}
}
if (numTop > 0) {
// at least one sub had exact match to the requested term
return SeekStatus.FOUND;
} else if (queue.size() > 0) {
// no sub had exact match, but at least one sub found
// a term after the requested term -- advance to that
// next term:
pullTop();
return SeekStatus.NOT_FOUND;
} else {
return SeekStatus.END;
}
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
@Override
public long ord() {
throw new UnsupportedOperationException();
}
private void pullTop() {
// extract all subs from the queue that have the same
// top term
assert numTop == 0;
numTop = queue.fillTop(top);
current = top[0].current;
}
private void pushTop() throws IOException {
// call next() on each top, and reorder queue
for (int i = 0; i < numTop; i++) {
TermsEnumWithSlice top = queue.top();
top.current = top.terms.next();
if (top.current == null) {
queue.pop();
} else {
queue.updateTop();
}
}
numTop = 0;
}
@Override
public BytesRef next() throws IOException {
if (lastSeekExact) {
// Must seekCeil at this point, so those subs that
// didn't have the term can find the following term.
// NOTE: we could save some CPU by only seekCeil the
// subs that didn't match the last exact seek... but
// most impls short-circuit if you seekCeil to term
// they are already on.
final SeekStatus status = seekCeil(current);
assert status == SeekStatus.FOUND;
lastSeekExact = false;
}
lastSeek = null;
// restore queue
pushTop();
// gather equal top fields
if (queue.size() > 0) {
// TODO: we could maybe defer this somewhat costly operation until one of the APIs that
// needs to see the top is invoked (docFreq, postings, etc.)
pullTop();
} else {
current = null;
}
return current;
}
@Override
public int docFreq() throws IOException {
int sum = 0;
for(int i=0;i<numTop;i++) {
sum += top[i].terms.docFreq();
}
return sum;
}
@Override
public long totalTermFreq() throws IOException {
long sum = 0;
for(int i=0;i<numTop;i++) {
final long v = top[i].terms.totalTermFreq();
assert v != -1;
sum += v;
}
return sum;
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
MultiPostingsEnum docsEnum;
// Can only reuse if incoming enum is also a MultiDocsEnum
if (reuse != null && reuse instanceof MultiPostingsEnum) {
docsEnum = (MultiPostingsEnum) reuse;
// ... and was previously created w/ this MultiTermsEnum:
if (!docsEnum.canReuse(this)) {
docsEnum = new MultiPostingsEnum(this, subs.length);
}
} else {
docsEnum = new MultiPostingsEnum(this, subs.length);
}
int upto = 0;
ArrayUtil.timSort(top, 0, numTop, INDEX_COMPARATOR);
for(int i=0;i<numTop;i++) {
final TermsEnumWithSlice entry = top[i];
assert entry.index < docsEnum.subPostingsEnums.length: entry.index + " vs " + docsEnum.subPostingsEnums.length + "; " + subs.length;
final PostingsEnum subPostingsEnum = entry.terms.postings(docsEnum.subPostingsEnums[entry.index], flags);
assert subPostingsEnum != null;
docsEnum.subPostingsEnums[entry.index] = subPostingsEnum;
subDocs[upto].postingsEnum = subPostingsEnum;
subDocs[upto].slice = entry.subSlice;
upto++;
}
return docsEnum.reset(subDocs, upto);
}
@Override
public ImpactsEnum impacts(int flags) throws IOException {
// implemented to not fail CheckIndex, but you shouldn't be using impacts on a slow reader
return new SlowImpactsEnum(postings(null, flags));
}
final static class TermsEnumWithSlice {
private final ReaderSlice subSlice;
TermsEnum terms;
public BytesRef current;
final int index;
public TermsEnumWithSlice(int index, ReaderSlice subSlice) {
this.subSlice = subSlice;
this.index = index;
assert subSlice.length >= 0: "length=" + subSlice.length;
}
public void reset(TermsEnum terms, BytesRef term) {
this.terms = terms;
current = term;
}
@Override
public String toString() {
return subSlice.toString()+":"+terms;
}
}
private final static class TermMergeQueue extends PriorityQueue<TermsEnumWithSlice> {
final int[] stack;
TermMergeQueue(int size) {
super(size);
this.stack = new int[size];
}
@Override
protected boolean lessThan(TermsEnumWithSlice termsA, TermsEnumWithSlice termsB) {
return termsA.current.compareTo(termsB.current) < 0;
}
/** Add the {@link #top()} slice as well as all slices that are positionned
* on the same term to {@code tops} and return how many of them there are. */
int fillTop(TermsEnumWithSlice[] tops) {
final int size = size();
if (size == 0) {
return 0;
}
tops[0] = top();
int numTop = 1;
stack[0] = 1;
int stackLen = 1;
while (stackLen != 0) {
final int index = stack[--stackLen];
final int leftChild = index << 1;
for (int child = leftChild, end = Math.min(size, leftChild + 1); child <= end; ++child) {
TermsEnumWithSlice te = get(child);
if (te.current.equals(tops[0].current)) {
tops[numTop++] = te;
stack[stackLen++] = child;
}
}
}
return numTop;
}
private TermsEnumWithSlice get(int i) {
return (TermsEnumWithSlice) getHeapArray()[i];
}
}
@Override
public String toString() {
return "MultiTermsEnum(" + Arrays.toString(subs) + ")";
}
}