/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.memory;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
// TODO:
// - build depth-N prefix hash?
// - or: longer dense skip lists than just next byte?
/**
* Wraps the {@link Lucene90PostingsFormat} for on-disk storage, but at read time loads and stores
* all terms and postings directly in RAM as byte[] and int[] arrays.
*
* <p><b>WARNING</b>: This is exceptionally RAM intensive: it makes no effort to compress the
* postings data, storing terms as separate byte[] and postings as separate int[], but in exchange
* it gives a substantial increase in search performance.
*
* <p>This postings format supports {@link TermsEnum#ord} and {@link TermsEnum#seekExact(long)}.
*
* <p>Because this holds all term bytes as a single byte[], you cannot have more than 2.1GB worth of
* term bytes in a single segment.
*
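* <p>A minimal usage sketch (assuming a Lucene release whose default codec is {@code
* Lucene90Codec}, matching the delegate format this class wraps, and some {@code Analyzer
* analyzer}; adapt to your own codec wiring):
*
* <pre>{@code
* IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
* iwc.setCodec(
*     new Lucene90Codec() {
*       @Override
*       public PostingsFormat getPostingsFormatForField(String field) {
*         return new DirectPostingsFormat();
*       }
*     });
* }</pre>
*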
* @lucene.experimental
*/
public final class DirectPostingsFormat extends PostingsFormat {
private final int minSkipCount;
private final int lowFreqCutoff;
private static final int DEFAULT_MIN_SKIP_COUNT = 8;
private static final int DEFAULT_LOW_FREQ_CUTOFF = 32;
// private static final boolean DEBUG = true;
// TODO: allow passing/wrapping arbitrary postings format?
public DirectPostingsFormat() {
this(DEFAULT_MIN_SKIP_COUNT, DEFAULT_LOW_FREQ_CUTOFF);
}
/**
* minSkipCount is how many terms in a row must have the same prefix before we put a skip pointer
* down. Terms with docFreq &lt;= lowFreqCutoff will use a single int[] to hold all docs, freqs,
* positions and offsets; terms with higher docFreq will use separate arrays.
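*
* <p>For example, {@code new DirectPostingsFormat(8, 32)} reproduces the built-in defaults: a
* skip pointer once 8 consecutive terms share a prefix, and packed single-int[] postings for
* terms whose docFreq is at most 32.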
*/
public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) {
super("Direct");
this.minSkipCount = minSkipCount;
this.lowFreqCutoff = lowFreqCutoff;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return PostingsFormat.forName("Lucene90").fieldsConsumer(state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
FieldsProducer postings = PostingsFormat.forName("Lucene90").fieldsProducer(state);
if (state.context.context != IOContext.Context.MERGE) {
FieldsProducer loadedPostings;
try {
postings.checkIntegrity();
loadedPostings = new DirectFields(state, postings, minSkipCount, lowFreqCutoff);
} finally {
postings.close();
}
return loadedPostings;
} else {
// Don't load postings for merge:
return postings;
}
}
private static final class DirectFields extends FieldsProducer {
private final Map<String, DirectField> fields = new TreeMap<>();
public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff)
throws IOException {
for (String field : fields) {
this.fields.put(
field, new DirectField(state, field, fields.terms(field), minSkipCount, lowFreqCutoff));
}
}
@Override
public Iterator<String> iterator() {
return Collections.unmodifiableSet(fields.keySet()).iterator();
}
@Override
public Terms terms(String field) {
return fields.get(field);
}
@Override
public int size() {
return fields.size();
}
@Override
public void close() {}
@Override
public long ramBytesUsed() {
long sizeInBytes = 0;
for (Map.Entry<String, DirectField> entry : fields.entrySet()) {
sizeInBytes += entry.getKey().length() * Character.BYTES;
sizeInBytes += entry.getValue().ramBytesUsed();
}
return sizeInBytes;
}
@Override
public Collection<Accountable> getChildResources() {
return Accountables.namedAccountables("field", fields);
}
@Override
public void checkIntegrity() throws IOException {
// if we read entirely into ram, we already validated up front;
// otherwise we returned the raw postings reader above, so this is a no-op
}
@Override
public String toString() {
return getClass().getSimpleName() + "(fields=" + fields.size() + ")";
}
}
private static final class DirectField extends Terms implements Accountable {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(DirectField.class);
private abstract static class TermAndSkip implements Accountable {
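/** Ords of later terms this term can skip ahead to; filled during load, then packed and nulled. */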
public int[] skips;
}
private static final class LowFreqTerm extends TermAndSkip {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(LowFreqTerm.class);
public final int[] postings;
public final byte[] payloads;
public final int docFreq;
public final int totalTermFreq;
public LowFreqTerm(int[] postings, byte[] payloads, int docFreq, int totalTermFreq) {
this.postings = postings;
this.payloads = payloads;
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED
+ ((postings != null) ? RamUsageEstimator.sizeOf(postings) : 0)
+ ((payloads != null) ? RamUsageEstimator.sizeOf(payloads) : 0);
}
}
// TODO: maybe specialize into prx/no-prx/no-frq cases?
private static final class HighFreqTerm extends TermAndSkip {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(HighFreqTerm.class);
public final long totalTermFreq;
public final int[] docIDs;
public final int[] freqs;
public final int[][] positions;
public final byte[][][] payloads;
public HighFreqTerm(
int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads, long totalTermFreq) {
this.docIDs = docIDs;
this.freqs = freqs;
this.positions = positions;
this.payloads = payloads;
this.totalTermFreq = totalTermFreq;
}
@Override
public long ramBytesUsed() {
long sizeInBytes = BASE_RAM_BYTES_USED;
sizeInBytes += (docIDs != null) ? RamUsageEstimator.sizeOf(docIDs) : 0;
sizeInBytes += (freqs != null) ? RamUsageEstimator.sizeOf(freqs) : 0;
if (positions != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(positions);
for (int[] position : positions) {
sizeInBytes += (position != null) ? RamUsageEstimator.sizeOf(position) : 0;
}
}
if (payloads != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(payloads);
for (byte[][] payload : payloads) {
if (payload != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(payload);
for (byte[] pload : payload) {
sizeInBytes += (pload != null) ? RamUsageEstimator.sizeOf(pload) : 0;
}
}
}
}
return sizeInBytes;
}
}
private final byte[] termBytes;
private final int[] termOffsets;
private final int[] skips;
private final int[] skipOffsets;
private final TermAndSkip[] terms;
private final boolean hasFreq;
private final boolean hasPos;
private final boolean hasOffsets;
private final boolean hasPayloads;
private final long sumTotalTermFreq;
private final int docCount;
private final long sumDocFreq;
private int skipCount;
// TODO: maybe make a separate builder? These are only
// used during load:
private int count;
private int[] sameCounts = new int[10];
private final int minSkipCount;
private static final class IntArrayWriter {
private int[] ints = new int[10];
private int upto;
public void add(int value) {
if (ints.length == upto) {
ints = ArrayUtil.grow(ints);
}
ints[upto++] = value;
}
public int[] get() {
final int[] arr = new int[upto];
System.arraycopy(ints, 0, arr, 0, upto);
upto = 0;
return arr;
}
}
public DirectField(
SegmentReadState state, String field, Terms termsIn, int minSkipCount, int lowFreqCutoff)
throws IOException {
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
sumTotalTermFreq = termsIn.getSumTotalTermFreq();
sumDocFreq = termsIn.getSumDocFreq();
docCount = termsIn.getDocCount();
final int numTerms = (int) termsIn.size();
if (numTerms == -1) {
throw new IllegalArgumentException("codec does not provide Terms.size()");
}
terms = new TermAndSkip[numTerms];
termOffsets = new int[1 + numTerms];
byte[] termBytes = new byte[1024];
this.minSkipCount = minSkipCount;
hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS) > 0;
hasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) > 0;
hasOffsets =
fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0;
hasPayloads = fieldInfo.hasPayloads();
BytesRef term;
PostingsEnum postingsEnum = null;
PostingsEnum docsAndPositionsEnum = null;
final TermsEnum termsEnum = termsIn.iterator();
int termOffset = 0;
final IntArrayWriter scratch = new IntArrayWriter();
// Used for payloads, if any:
final ByteBuffersDataOutput ros = ByteBuffersDataOutput.newResettableInstance();
// if (DEBUG) {
// System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + "
// hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" +
// hasPayloads);
// }
while ((term = termsEnum.next()) != null) {
final int docFreq = termsEnum.docFreq();
final long totalTermFreq = termsEnum.totalTermFreq();
// if (DEBUG) {
// System.out.println(" term=" + term.utf8ToString());
// }
termOffsets[count] = termOffset;
if (termBytes.length < (termOffset + term.length)) {
termBytes = ArrayUtil.grow(termBytes, termOffset + term.length);
}
System.arraycopy(term.bytes, term.offset, termBytes, termOffset, term.length);
termOffset += term.length;
termOffsets[count + 1] = termOffset;
if (hasPos) {
docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.ALL);
} else {
postingsEnum = termsEnum.postings(postingsEnum);
}
final TermAndSkip ent;
final PostingsEnum postingsEnum2;
if (hasPos) {
postingsEnum2 = docsAndPositionsEnum;
} else {
postingsEnum2 = postingsEnum;
}
int docID;
if (docFreq <= lowFreqCutoff) {
ros.reset();
// Pack postings for low-freq terms into a single int[]:
while ((docID = postingsEnum2.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
scratch.add(docID);
if (hasFreq) {
final int freq = postingsEnum2.freq();
scratch.add(freq);
if (hasPos) {
for (int pos = 0; pos < freq; pos++) {
scratch.add(docsAndPositionsEnum.nextPosition());
if (hasOffsets) {
scratch.add(docsAndPositionsEnum.startOffset());
scratch.add(docsAndPositionsEnum.endOffset());
}
if (hasPayloads) {
final BytesRef payload = docsAndPositionsEnum.getPayload();
if (payload != null) {
scratch.add(payload.length);
ros.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
scratch.add(0);
}
}
}
}
}
}
final byte[] payloads = hasPayloads ? ros.toArrayCopy() : null;
final int[] postings = scratch.get();
ent = new LowFreqTerm(postings, payloads, docFreq, (int) totalTermFreq);
} else {
final int[] docs = new int[docFreq];
final int[] freqs;
final int[][] positions;
final byte[][][] payloads;
if (hasFreq) {
freqs = new int[docFreq];
if (hasPos) {
positions = new int[docFreq][];
if (hasPayloads) {
payloads = new byte[docFreq][][];
} else {
payloads = null;
}
} else {
positions = null;
payloads = null;
}
} else {
freqs = null;
positions = null;
payloads = null;
}
// Use separate int[] for the postings for high-freq
// terms:
int upto = 0;
while ((docID = postingsEnum2.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
docs[upto] = docID;
if (hasFreq) {
final int freq = postingsEnum2.freq();
freqs[upto] = freq;
if (hasPos) {
final int mult;
if (hasOffsets) {
mult = 3;
} else {
mult = 1;
}
if (hasPayloads) {
payloads[upto] = new byte[freq][];
}
positions[upto] = new int[mult * freq];
int posUpto = 0;
for (int pos = 0; pos < freq; pos++) {
positions[upto][posUpto] = docsAndPositionsEnum.nextPosition();
if (hasPayloads) {
BytesRef payload = docsAndPositionsEnum.getPayload();
if (payload != null) {
byte[] payloadBytes = new byte[payload.length];
System.arraycopy(
payload.bytes, payload.offset, payloadBytes, 0, payload.length);
payloads[upto][pos] = payloadBytes;
}
}
posUpto++;
if (hasOffsets) {
positions[upto][posUpto++] = docsAndPositionsEnum.startOffset();
positions[upto][posUpto++] = docsAndPositionsEnum.endOffset();
}
}
}
}
upto++;
}
assert upto == docFreq;
ent = new HighFreqTerm(docs, freqs, positions, payloads, totalTermFreq);
}
terms[count] = ent;
setSkips(count, termBytes);
count++;
}
// End sentinel:
termOffsets[count] = termOffset;
finishSkips();
// System.out.println(skipCount + " skips: " + field);
this.termBytes = new byte[termOffset];
System.arraycopy(termBytes, 0, this.termBytes, 0, termOffset);
// Pack skips:
this.skips = new int[skipCount];
this.skipOffsets = new int[1 + numTerms];
int skipOffset = 0;
for (int i = 0; i < numTerms; i++) {
final int[] termSkips = terms[i].skips;
skipOffsets[i] = skipOffset;
if (termSkips != null) {
System.arraycopy(termSkips, 0, skips, skipOffset, termSkips.length);
skipOffset += termSkips.length;
terms[i].skips = null;
}
}
this.skipOffsets[numTerms] = skipOffset;
assert skipOffset == skipCount;
}
@Override
public long ramBytesUsed() {
long sizeInBytes = BASE_RAM_BYTES_USED;
sizeInBytes += ((termBytes != null) ? RamUsageEstimator.sizeOf(termBytes) : 0);
sizeInBytes += ((termOffsets != null) ? RamUsageEstimator.sizeOf(termOffsets) : 0);
sizeInBytes += ((skips != null) ? RamUsageEstimator.sizeOf(skips) : 0);
sizeInBytes += ((skipOffsets != null) ? RamUsageEstimator.sizeOf(skipOffsets) : 0);
sizeInBytes += ((sameCounts != null) ? RamUsageEstimator.sizeOf(sameCounts) : 0);
if (terms != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(terms);
for (TermAndSkip termAndSkip : terms) {
sizeInBytes += (termAndSkip != null) ? termAndSkip.ramBytesUsed() : 0;
}
}
return sizeInBytes;
}
@Override
public String toString() {
return "DirectTerms(terms="
+ terms.length
+ ",postings="
+ sumDocFreq
+ ",positions="
+ sumTotalTermFreq
+ ",docs="
+ docCount
+ ")";
}
// Compares in Unicode code point order (unsigned UTF-8 byte comparison):
int compare(int ord, BytesRef other) {
final byte[] otherBytes = other.bytes;
int upto = termOffsets[ord];
final int termLen = termOffsets[1 + ord] - upto;
int otherUpto = other.offset;
final int stop = upto + Math.min(termLen, other.length);
while (upto < stop) {
int diff = (termBytes[upto++] & 0xFF) - (otherBytes[otherUpto++] & 0xFF);
if (diff != 0) {
return diff;
}
}
// One is a prefix of the other, or, they are equal:
return termLen - other.length;
}
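// sameCounts[i] counts how many consecutive terms, ending at the one just added, share their
// first i+1 bytes; when such a run reaches minSkipCount, saveSkip records a skip pointer on
// the term that started the run: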
private void setSkips(int termOrd, byte[] termBytes) {
final int termLength = termOffsets[termOrd + 1] - termOffsets[termOrd];
if (sameCounts.length < termLength) {
sameCounts = ArrayUtil.grow(sameCounts, termLength);
}
// Update skip pointers:
if (termOrd > 0) {
final int lastTermLength = termOffsets[termOrd] - termOffsets[termOrd - 1];
final int limit = Math.min(termLength, lastTermLength);
int lastTermOffset = termOffsets[termOrd - 1];
int termOffset = termOffsets[termOrd];
int i = 0;
for (; i < limit; i++) {
if (termBytes[lastTermOffset++] == termBytes[termOffset++]) {
sameCounts[i]++;
} else {
for (; i < limit; i++) {
if (sameCounts[i] >= minSkipCount) {
// Go back and add a skip pointer:
saveSkip(termOrd, sameCounts[i]);
}
sameCounts[i] = 1;
}
break;
}
}
for (; i < lastTermLength; i++) {
if (sameCounts[i] >= minSkipCount) {
// Go back and add a skip pointer:
saveSkip(termOrd, sameCounts[i]);
}
sameCounts[i] = 0;
}
for (int j = limit; j < termLength; j++) {
sameCounts[j] = 1;
}
} else {
for (int i = 0; i < termLength; i++) {
sameCounts[i]++;
}
}
}
private void finishSkips() {
assert count == terms.length;
int lastTermOffset = termOffsets[count - 1];
int lastTermLength = termOffsets[count] - lastTermOffset;
for (int i = 0; i < lastTermLength; i++) {
if (sameCounts[i] >= minSkipCount) {
// Go back and add a skip pointer:
saveSkip(count, sameCounts[i]);
}
}
// Reverse the skip pointers so they are "nested":
for (int termID = 0; termID < terms.length; termID++) {
TermAndSkip term = terms[termID];
if (term.skips != null && term.skips.length > 1) {
for (int pos = 0; pos < term.skips.length / 2; pos++) {
final int otherPos = term.skips.length - pos - 1;
final int temp = term.skips[pos];
term.skips[pos] = term.skips[otherPos];
term.skips[otherPos] = temp;
}
}
}
}
private void saveSkip(int ord, int backCount) {
final TermAndSkip term = terms[ord - backCount];
skipCount++;
if (term.skips == null) {
term.skips = new int[] {ord};
} else {
// Normally we'd grow at a slight exponential... but
// given that the skips themselves are already log(N)
// we can grow by only 1 and still have amortized
// linear time:
final int[] newSkips = new int[term.skips.length + 1];
System.arraycopy(term.skips, 0, newSkips, 0, term.skips.length);
term.skips = newSkips;
term.skips[term.skips.length - 1] = ord;
}
}
@Override
public TermsEnum iterator() {
return new DirectTermsEnum();
}
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new DirectIntersectTermsEnum(compiled, startTerm);
}
@Override
public long size() {
return terms.length;
}
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
@Override
public long getSumDocFreq() {
return sumDocFreq;
}
@Override
public int getDocCount() {
return docCount;
}
@Override
public boolean hasFreqs() {
return hasFreq;
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPos;
}
@Override
public boolean hasPayloads() {
return hasPayloads;
}
private final class DirectTermsEnum extends BaseTermsEnum {
private final BytesRef scratch = new BytesRef();
private int termOrd;
private DirectTermsEnum() {
termOrd = -1;
}
private BytesRef setTerm() {
scratch.bytes = termBytes;
scratch.offset = termOffsets[termOrd];
scratch.length = termOffsets[termOrd + 1] - termOffsets[termOrd];
return scratch;
}
@Override
public BytesRef next() {
termOrd++;
if (termOrd < terms.length) {
return setTerm();
} else {
return null;
}
}
@Override
public TermState termState() {
OrdTermState state = new OrdTermState();
state.ord = termOrd;
return state;
}
// If non-negative, exact match; else, -ord-1, where ord
// is where you would insert the term.
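// (e.g. a return value of -4 means "not found; would be inserted at ord 3".)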
private int findTerm(BytesRef term) {
// Just do binary search: should be (constant factor)
// faster than using the skip list:
int low = 0;
int high = terms.length - 1;
while (low <= high) {
int mid = (low + high) >>> 1;
int cmp = compare(mid, term);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
return mid; // key found
}
}
return -(low + 1); // key not found.
}
@Override
public SeekStatus seekCeil(BytesRef term) {
// TODO: we should use the skip pointers; should be
// faster than bin search; we should also hold
// & reuse current state so seeking forwards is
// faster
final int ord = findTerm(term);
// if (DEBUG) {
// System.out.println(" find term=" + term.utf8ToString() + " ord=" + ord);
// }
if (ord >= 0) {
termOrd = ord;
setTerm();
return SeekStatus.FOUND;
} else if (ord == -terms.length - 1) {
return SeekStatus.END;
} else {
termOrd = -ord - 1;
setTerm();
return SeekStatus.NOT_FOUND;
}
}
@Override
public boolean seekExact(BytesRef term) {
// TODO: we should use the skip pointers; should be
// faster than bin search; we should also hold
// & reuse current state so seeking forwards is
// faster
final int ord = findTerm(term);
if (ord >= 0) {
termOrd = ord;
setTerm();
return true;
} else {
return false;
}
}
@Override
public void seekExact(long ord) {
termOrd = (int) ord;
setTerm();
}
@Override
public void seekExact(BytesRef term, TermState state) throws IOException {
termOrd = (int) ((OrdTermState) state).ord;
setTerm();
assert term.equals(scratch);
}
@Override
public BytesRef term() {
return scratch;
}
@Override
public long ord() {
return termOrd;
}
@Override
public int docFreq() {
if (terms[termOrd] instanceof LowFreqTerm) {
return ((LowFreqTerm) terms[termOrd]).docFreq;
} else {
return ((HighFreqTerm) terms[termOrd]).docIDs.length;
}
}
@Override
public long totalTermFreq() {
if (terms[termOrd] instanceof LowFreqTerm) {
return ((LowFreqTerm) terms[termOrd]).totalTermFreq;
} else {
return ((HighFreqTerm) terms[termOrd]).totalTermFreq;
}
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
// TODO: implement reuse
// it's hairy!
// TODO: the logic of which enum impl to choose should be refactored to be simpler...
if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
if (terms[termOrd] instanceof LowFreqTerm) {
final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]);
final int[] postings = term.postings;
if (hasFreq == false) {
LowFreqDocsEnumNoTF docsEnum;
if (reuse instanceof LowFreqDocsEnumNoTF) {
docsEnum = (LowFreqDocsEnumNoTF) reuse;
} else {
docsEnum = new LowFreqDocsEnumNoTF();
}
return docsEnum.reset(postings);
} else if (hasPos == false) {
LowFreqDocsEnumNoPos docsEnum;
if (reuse instanceof LowFreqDocsEnumNoPos) {
docsEnum = (LowFreqDocsEnumNoPos) reuse;
} else {
docsEnum = new LowFreqDocsEnumNoPos();
}
return docsEnum.reset(postings);
}
final byte[] payloads = term.payloads;
return new LowFreqPostingsEnum(hasOffsets, hasPayloads).reset(postings, payloads);
} else {
final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
if (hasPos == false) {
return new HighFreqDocsEnum().reset(term.docIDs, term.freqs);
} else {
return new HighFreqPostingsEnum(hasOffsets)
.reset(term.docIDs, term.freqs, term.positions, term.payloads);
}
}
}
if (terms[termOrd] instanceof LowFreqTerm) {
final int[] postings = ((LowFreqTerm) terms[termOrd]).postings;
if (hasFreq) {
if (hasPos) {
int posLen;
if (hasOffsets) {
posLen = 3;
} else {
posLen = 1;
}
if (hasPayloads) {
posLen++;
}
LowFreqDocsEnum docsEnum;
if (reuse instanceof LowFreqDocsEnum) {
docsEnum = (LowFreqDocsEnum) reuse;
if (!docsEnum.canReuse(posLen)) {
docsEnum = new LowFreqDocsEnum(posLen);
}
} else {
docsEnum = new LowFreqDocsEnum(posLen);
}
return docsEnum.reset(postings);
} else {
LowFreqDocsEnumNoPos docsEnum;
if (reuse instanceof LowFreqDocsEnumNoPos) {
docsEnum = (LowFreqDocsEnumNoPos) reuse;
} else {
docsEnum = new LowFreqDocsEnumNoPos();
}
return docsEnum.reset(postings);
}
} else {
LowFreqDocsEnumNoTF docsEnum;
if (reuse instanceof LowFreqDocsEnumNoTF) {
docsEnum = (LowFreqDocsEnumNoTF) reuse;
} else {
docsEnum = new LowFreqDocsEnumNoTF();
}
return docsEnum.reset(postings);
}
} else {
final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
HighFreqDocsEnum docsEnum;
if (reuse instanceof HighFreqDocsEnum) {
docsEnum = (HighFreqDocsEnum) reuse;
} else {
docsEnum = new HighFreqDocsEnum();
}
// System.out.println(" DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString()
// + ": " + term.docIDs.length + " docs");
return docsEnum.reset(term.docIDs, term.freqs);
}
}
@Override
public ImpactsEnum impacts(int flags) throws IOException {
return new SlowImpactsEnum(postings(null, flags));
}
}
private final class DirectIntersectTermsEnum extends BaseTermsEnum {
private final RunAutomaton runAutomaton;
private final CompiledAutomaton compiledAutomaton;
private int termOrd;
private final BytesRef scratch = new BytesRef();
private final class State {
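// Ord of the first term past the run of terms sharing this state's prefix; once termOrd
// reaches it, this state is popped: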
int changeOrd;
int state;
int transitionUpto;
int transitionCount;
int transitionMax;
int transitionMin;
final Transition transition = new Transition();
}
private State[] states;
private int stateUpto;
public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) {
runAutomaton = compiled.runAutomaton;
compiledAutomaton = compiled;
termOrd = -1;
states = new State[1];
states[0] = new State();
states[0].changeOrd = terms.length;
states[0].state = 0;
states[0].transitionCount = compiledAutomaton.automaton.getNumTransitions(states[0].state);
compiledAutomaton.automaton.initTransition(states[0].state, states[0].transition);
states[0].transitionUpto = -1;
states[0].transitionMax = -1;
// System.out.println("IE.init startTerm=" + startTerm);
if (startTerm != null) {
int skipUpto = 0;
if (startTerm.length == 0) {
if (terms.length > 0 && termOffsets[1] == 0) {
termOrd = 0;
}
} else {
termOrd++;
nextLabel:
for (int i = 0; i < startTerm.length; i++) {
final int label = startTerm.bytes[startTerm.offset + i] & 0xFF;
while (label > states[i].transitionMax) {
states[i].transitionUpto++;
assert states[i].transitionUpto < states[i].transitionCount;
compiledAutomaton.automaton.getNextTransition(states[i].transition);
states[i].transitionMin = states[i].transition.min;
states[i].transitionMax = states[i].transition.max;
assert states[i].transitionMin >= 0;
assert states[i].transitionMin <= 255;
assert states[i].transitionMax >= 0;
assert states[i].transitionMax <= 255;
}
// Skip forwards until we find a term matching
// the label at this position:
while (termOrd < terms.length) {
final int skipOffset = skipOffsets[termOrd];
final int numSkips = skipOffsets[termOrd + 1] - skipOffset;
final int termOffset = termOffsets[termOrd];
final int termLength = termOffsets[1 + termOrd] - termOffset;
// if (DEBUG) {
// System.out.println(" check termOrd=" + termOrd + " term=" + new
// BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" +
// Arrays.toString(skips) + " i=" + i);
// }
if (termOrd == states[stateUpto].changeOrd) {
// if (DEBUG) {
// System.out.println(" end push return");
// }
stateUpto--;
termOrd--;
return;
}
if (termLength == i) {
termOrd++;
skipUpto = 0;
// if (DEBUG) {
// System.out.println(" term too short; next term");
// }
} else if (label < (termBytes[termOffset + i] & 0xFF)) {
termOrd--;
// if (DEBUG) {
// System.out.println(" no match; already beyond; return termOrd=" + termOrd);
// }
stateUpto -= skipUpto;
assert stateUpto >= 0;
return;
} else if (label == (termBytes[termOffset + i] & 0xFF)) {
// if (DEBUG) {
// System.out.println(" label[" + i + "] matches");
// }
if (skipUpto < numSkips) {
grow();
final int nextState = runAutomaton.step(states[stateUpto].state, label);
// Automaton is required to accept startTerm:
assert nextState != -1;
stateUpto++;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
states[stateUpto].state = nextState;
states[stateUpto].transitionCount =
compiledAutomaton.automaton.getNumTransitions(nextState);
compiledAutomaton.automaton.initTransition(
states[stateUpto].state, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
// System.out.println(" push " + states[stateUpto].transitions.length + "
// trans");
// if (DEBUG) {
// System.out.println(" push skip; changeOrd=" +
// states[stateUpto].changeOrd);
// }
// Match next label at this same term:
continue nextLabel;
} else {
// if (DEBUG) {
// System.out.println(" linear scan");
// }
// Index exhausted: just scan now (the
// number of scans required will be less
// than the minSkipCount):
final int startTermOrd = termOrd;
while (termOrd < terms.length && compare(termOrd, startTerm) <= 0) {
assert termOrd == startTermOrd
|| skipOffsets[termOrd] == skipOffsets[termOrd + 1];
termOrd++;
}
assert termOrd - startTermOrd < minSkipCount;
termOrd--;
stateUpto -= skipUpto;
// if (DEBUG) {
// System.out.println(" end termOrd=" + termOrd);
// }
return;
}
} else {
if (skipUpto < numSkips) {
termOrd = skips[skipOffset + skipUpto];
// if (DEBUG) {
// System.out.println(" no match; skip to termOrd=" + termOrd);
// }
} else {
// if (DEBUG) {
// System.out.println(" no match; next term");
// }
termOrd++;
}
skipUpto = 0;
}
}
// startTerm is >= last term so enum will not
// return any terms:
termOrd--;
// if (DEBUG) {
// System.out.println(" beyond end; no terms will match");
// }
return;
}
}
if (termOrd >= 0) {
final int termOffset = termOffsets[termOrd];
final int termLen = termOffsets[1 + termOrd] - termOffset;
if (!startTerm.equals(new BytesRef(termBytes, termOffset, termLen))) {
stateUpto -= skipUpto;
termOrd--;
}
}
// if (DEBUG) {
// System.out.println(" loop end; return termOrd=" + termOrd + " stateUpto=" +
// stateUpto);
// }
}
}
private void grow() {
if (states.length == 1 + stateUpto) {
final State[] newStates = new State[states.length + 1];
System.arraycopy(states, 0, newStates, 0, states.length);
newStates[states.length] = new State();
states = newStates;
}
}
@Override
public BytesRef next() {
// if (DEBUG) {
// System.out.println("\nIE.next");
// }
termOrd++;
int skipUpto = 0;
if (termOrd == 0 && termOffsets[1] == 0) {
// Special-case empty string:
assert stateUpto == 0;
// if (DEBUG) {
// System.out.println(" visit empty string");
// }
if (runAutomaton.isAccept(states[0].state)) {
scratch.bytes = termBytes;
scratch.offset = 0;
scratch.length = 0;
return scratch;
}
termOrd++;
}
nextTerm:
while (true) {
// if (DEBUG) {
// System.out.println(" cycle termOrd=" + termOrd + " stateUpto=" + stateUpto + "
// skipUpto=" + skipUpto);
// }
if (termOrd == terms.length) {
// if (DEBUG) {
// System.out.println(" return END");
// }
return null;
}
final State state = states[stateUpto];
if (termOrd == state.changeOrd) {
// Pop:
// if (DEBUG) {
// System.out.println(" pop stateUpto=" + stateUpto);
// }
stateUpto--;
/*
if (DEBUG) {
try {
//System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)).utf8ToString());
System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)));
} catch (ArrayIndexOutOfBoundsException aioobe) {
System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)));
}
}
*/
continue;
}
final int termOffset = termOffsets[termOrd];
final int termLength = termOffsets[termOrd + 1] - termOffset;
final int skipOffset = skipOffsets[termOrd];
final int numSkips = skipOffsets[termOrd + 1] - skipOffset;
// if (DEBUG) {
// System.out.println(" term=" + new BytesRef(termBytes, termOffset,
// termLength).utf8ToString() + " skips=" + Arrays.toString(skips));
// }
assert termOrd < state.changeOrd;
assert stateUpto <= termLength : "term.length=" + termLength + "; stateUpto=" + stateUpto;
final int label = termBytes[termOffset + stateUpto] & 0xFF;
while (label > state.transitionMax) {
// System.out.println(" label=" + label + " vs max=" + state.transitionMax + "
// transUpto=" + state.transitionUpto + " vs " + state.transitions.length);
state.transitionUpto++;
if (state.transitionUpto == state.transitionCount) {
// We've exhausted transitions leaving this
// state; force pop+next/skip now:
// System.out.println("forcepop: stateUpto=" + stateUpto);
if (stateUpto == 0) {
termOrd = terms.length;
return null;
} else {
assert state.changeOrd > termOrd;
// if (DEBUG) {
// System.out.println(" jumpend " + (state.changeOrd - termOrd));
// }
// System.out.println(" jump to termOrd=" + states[stateUpto].changeOrd + " vs " +
// termOrd);
termOrd = states[stateUpto].changeOrd;
skipUpto = 0;
stateUpto--;
}
continue nextTerm;
}
compiledAutomaton.automaton.getNextTransition(state.transition);
assert state.transitionUpto < state.transitionCount
: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitionCount;
state.transitionMin = state.transition.min;
state.transitionMax = state.transition.max;
assert state.transitionMin >= 0;
assert state.transitionMin <= 255;
assert state.transitionMax >= 0;
assert state.transitionMax <= 255;
}
/*
if (DEBUG) {
System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + (char) label + "(" + label + ") term=" + new BytesRef(terms[termOrd].term).utf8ToString() + " trans " +
(char) state.transitionMin + "(" + state.transitionMin + ")" + "-" + (char) state.transitionMax + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips)));
System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + Integer.toHexString(label) + "(" + label + ") term=" + new BytesRef(termBytes, termOffset, termLength) + " trans " +
Integer.toHexString(state.transitionMin) + "(" + state.transitionMin + ")" + "-" + Integer.toHexString(state.transitionMax) + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips)));
}
*/
final int targetLabel = state.transitionMin;
if ((termBytes[termOffset + stateUpto] & 0xFF) < targetLabel) {
// if (DEBUG) {
// System.out.println(" do bin search");
// }
// int startTermOrd = termOrd;
int low = termOrd + 1;
int high = state.changeOrd - 1;
while (true) {
if (low > high) {
// Label not found
termOrd = low;
// if (DEBUG) {
// System.out.println(" advanced by " + (termOrd - startTermOrd));
// }
// System.out.println(" jump " + (termOrd - startTermOrd));
skipUpto = 0;
continue nextTerm;
}
int mid = (low + high) >>> 1;
int cmp = (termBytes[termOffsets[mid] + stateUpto] & 0xFF) - targetLabel;
// if (DEBUG) {
// System.out.println(" bin: check label=" + (char) (termBytes[termOffsets[low]
// + stateUpto] & 0xFF) + " ord=" + mid);
// }
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
// Label found; walk backwards to first
// occurrence:
while (mid > termOrd
&& (termBytes[termOffsets[mid - 1] + stateUpto] & 0xFF) == targetLabel) {
mid--;
}
termOrd = mid;
// if (DEBUG) {
// System.out.println(" advanced by " + (termOrd - startTermOrd));
// }
// System.out.println(" jump " + (termOrd - startTermOrd));
skipUpto = 0;
continue nextTerm;
}
}
}
int nextState = runAutomaton.step(states[stateUpto].state, label);
if (nextState == -1) {
// Skip
// if (DEBUG) {
// System.out.println(" automaton doesn't accept; skip");
// }
if (skipUpto < numSkips) {
// if (DEBUG) {
// System.out.println(" jump " + (skips[skipOffset+skipUpto]-1 - termOrd));
// }
termOrd = skips[skipOffset + skipUpto];
} else {
termOrd++;
}
skipUpto = 0;
} else if (skipUpto < numSkips) {
// Push:
// if (DEBUG) {
// System.out.println(" push");
// }
/*
if (DEBUG) {
try {
//System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1).utf8ToString());
System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1));
} catch (ArrayIndexOutOfBoundsException aioobe) {
System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1));
}
}
*/
grow();
stateUpto++;
states[stateUpto].state = nextState;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
states[stateUpto].transitionCount =
compiledAutomaton.automaton.getNumTransitions(nextState);
compiledAutomaton.automaton.initTransition(nextState, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
if (stateUpto == termLength) {
// if (DEBUG) {
// System.out.println(" term ends after push");
// }
if (runAutomaton.isAccept(nextState)) {
// if (DEBUG) {
// System.out.println(" automaton accepts: return");
// }
scratch.bytes = termBytes;
scratch.offset = termOffsets[termOrd];
scratch.length = termOffsets[1 + termOrd] - scratch.offset;
// if (DEBUG) {
// System.out.println(" ret " + scratch.utf8ToString());
// }
return scratch;
} else {
// if (DEBUG) {
// System.out.println(" automaton rejects: nextTerm");
// }
termOrd++;
skipUpto = 0;
}
}
} else {
// Run the non-indexed tail of this term:
// TODO: add assert that we don't inc too many times
if (compiledAutomaton.commonSuffixRef != null) {
// System.out.println("suffix " + compiledAutomaton.commonSuffixRef.utf8ToString());
assert compiledAutomaton.commonSuffixRef.offset == 0;
if (termLength < compiledAutomaton.commonSuffixRef.length) {
termOrd++;
skipUpto = 0;
continue nextTerm;
}
int offset = termOffset + termLength - compiledAutomaton.commonSuffixRef.length;
for (int suffix = 0; suffix < compiledAutomaton.commonSuffixRef.length; suffix++) {
if (termBytes[offset + suffix] != compiledAutomaton.commonSuffixRef.bytes[suffix]) {
termOrd++;
skipUpto = 0;
continue nextTerm;
}
}
}
int upto = stateUpto + 1;
while (upto < termLength) {
nextState = runAutomaton.step(nextState, termBytes[termOffset + upto] & 0xFF);
if (nextState == -1) {
termOrd++;
skipUpto = 0;
// if (DEBUG) {
// System.out.println(" nomatch tail; next term");
// }
continue nextTerm;
}
upto++;
}
if (runAutomaton.isAccept(nextState)) {
scratch.bytes = termBytes;
scratch.offset = termOffsets[termOrd];
scratch.length = termOffsets[1 + termOrd] - scratch.offset;
// if (DEBUG) {
// System.out.println(" match tail; return " + scratch.utf8ToString());
// System.out.println(" ret2 " + scratch.utf8ToString());
// }
return scratch;
} else {
termOrd++;
skipUpto = 0;
// if (DEBUG) {
// System.out.println(" nomatch tail; next term");
// }
}
}
}
}
@Override
public TermState termState() {
OrdTermState state = new OrdTermState();
state.ord = termOrd;
return state;
}
@Override
public BytesRef term() {
return scratch;
}
@Override
public long ord() {
return termOrd;
}
@Override
public int docFreq() {
if (terms[termOrd] instanceof LowFreqTerm) {
return ((LowFreqTerm) terms[termOrd]).docFreq;
} else {
return ((HighFreqTerm) terms[termOrd]).docIDs.length;
}
}
@Override
public long totalTermFreq() {
if (terms[termOrd] instanceof LowFreqTerm) {
return ((LowFreqTerm) terms[termOrd]).totalTermFreq;
} else {
return ((HighFreqTerm) terms[termOrd]).totalTermFreq;
}
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) {
// TODO: implement reuse
// it's hairy!
// TODO: the logic of which enum impl to choose should be refactored to be simpler...
if (hasPos && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
if (terms[termOrd] instanceof LowFreqTerm) {
final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]);
final int[] postings = term.postings;
final byte[] payloads = term.payloads;
return new LowFreqPostingsEnum(hasOffsets, hasPayloads).reset(postings, payloads);
} else {
final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
return new HighFreqPostingsEnum(hasOffsets)
.reset(term.docIDs, term.freqs, term.positions, term.payloads);
}
}
if (terms[termOrd] instanceof LowFreqTerm) {
final int[] postings = ((LowFreqTerm) terms[termOrd]).postings;
if (hasFreq) {
if (hasPos) {
int posLen;
if (hasOffsets) {
posLen = 3;
} else {
posLen = 1;
}
if (hasPayloads) {
posLen++;
}
return new LowFreqDocsEnum(posLen).reset(postings);
} else {
return new LowFreqDocsEnumNoPos().reset(postings);
}
} else {
return new LowFreqDocsEnumNoTF().reset(postings);
}
} else {
final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
// System.out.println("DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() +
// ": " + term.docIDs.length + " docs");
return new HighFreqDocsEnum().reset(term.docIDs, term.freqs);
}
}
@Override
public ImpactsEnum impacts(int flags) throws IOException {
return new SlowImpactsEnum(postings(null, flags));
}
@Override
public SeekStatus seekCeil(BytesRef term) {
throw new UnsupportedOperationException();
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
}
}
// Docs only:
private static final class LowFreqDocsEnumNoTF extends PostingsEnum {
private int[] postings;
private int upto;
public PostingsEnum reset(int[] postings) {
this.postings = postings;
upto = -1;
return this;
}
// TODO: can do this w/o setting members?
@Override
public int nextDoc() {
upto++;
if (upto < postings.length) {
return postings[upto];
}
return NO_MORE_DOCS;
}
@Override
public int docID() {
if (upto < 0) {
return -1;
} else if (upto < postings.length) {
return postings[upto];
} else {
return NO_MORE_DOCS;
}
}
@Override
public int freq() {
return 1;
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
@Override
public int advance(int target) throws IOException {
// Linear scan, but this is low-freq term so it won't
// be costly:
return slowAdvance(target);
}
@Override
public long cost() {
return postings.length;
}
}
// Docs + freqs:
private static final class LowFreqDocsEnumNoPos extends PostingsEnum {
private int[] postings;
private int upto;
public LowFreqDocsEnumNoPos() {}
public PostingsEnum reset(int[] postings) {
this.postings = postings;
upto = -2;
return this;
}
// TODO: can do this w/o setting members?
@Override
public int nextDoc() {
upto += 2;
if (upto < postings.length) {
return postings[upto];
}
return NO_MORE_DOCS;
}
@Override
public int docID() {
if (upto < 0) {
return -1;
} else if (upto < postings.length) {
return postings[upto];
} else {
return NO_MORE_DOCS;
}
}
@Override
public int freq() {
return postings[upto + 1];
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
@Override
public int advance(int target) throws IOException {
// Linear scan, but this is low-freq term so it won't
// be costly:
return slowAdvance(target);
}
@Override
public long cost() {
return postings.length / 2;
}
}
// Docs + freqs, stepping over packed positions/offsets/payloads without exposing them:
private static final class LowFreqDocsEnum extends PostingsEnum {
private int[] postings;
private final int posMult;
private int upto;
private int freq;
public LowFreqDocsEnum(int posMult) {
this.posMult = posMult;
// if (DEBUG) {
// System.out.println("LowFreqDE: posMult=" + posMult);
// }
}
public boolean canReuse(int posMult) {
return this.posMult == posMult;
}
public PostingsEnum reset(int[] postings) {
this.postings = postings;
upto = -2;
freq = 0;
return this;
}
// TODO: can do this w/o setting members?
@Override
public int nextDoc() {
upto += 2 + freq * posMult;
// if (DEBUG) {
// System.out.println(" nextDoc freq=" + freq + " upto=" + upto + " vs " +
// postings.length);
// }
if (upto < postings.length) {
freq = postings[upto + 1];
assert freq > 0;
return postings[upto];
}
return NO_MORE_DOCS;
}
@Override
public int docID() {
// TODO: store docID member?
if (upto < 0) {
return -1;
} else if (upto < postings.length) {
return postings[upto];
} else {
return NO_MORE_DOCS;
}
}
@Override
public int freq() {
// TODO: can I do postings[upto+1]?
return freq;
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
@Override
public int advance(int target) throws IOException {
// Linear scan, but this is low-freq term so it won't
// be costly:
return slowAdvance(target);
}
@Override
public long cost() {
// TODO: could do a better estimate
return postings.length / 2;
}
}
private static final class LowFreqPostingsEnum extends PostingsEnum {
private int[] postings;
private final int posMult;
private final boolean hasOffsets;
private final boolean hasPayloads;
private final BytesRef payload = new BytesRef();
private int upto;
private int docID;
private int freq;
private int skipPositions;
private int pos;
private int startOffset;
private int endOffset;
private int lastPayloadOffset;
private int payloadOffset;
private int payloadLength;
private byte[] payloadBytes;
public LowFreqPostingsEnum(boolean hasOffsets, boolean hasPayloads) {
this.hasOffsets = hasOffsets;
this.hasPayloads = hasPayloads;
if (hasOffsets) {
if (hasPayloads) {
posMult = 4;
} else {
posMult = 3;
}
} else if (hasPayloads) {
posMult = 2;
} else {
posMult = 1;
}
}
public PostingsEnum reset(int[] postings, byte[] payloadBytes) {
this.postings = postings;
upto = 0;
skipPositions = 0;
pos = -1;
startOffset = -1;
endOffset = -1;
docID = -1;
payloadLength = 0;
payloadOffset = 0;
this.payloadBytes = payloadBytes;
return this;
}
@Override
public int nextDoc() {
pos = -1;
if (hasPayloads) {
for (int i = 0; i < skipPositions; i++) {
upto++;
if (hasOffsets) {
upto += 2;
}
payloadOffset += postings[upto++];
}
} else {
upto += posMult * skipPositions;
}
if (upto < postings.length) {
docID = postings[upto++];
freq = postings[upto++];
skipPositions = freq;
return docID;
}
return docID = NO_MORE_DOCS;
}
@Override
public int docID() {
return docID;
}
@Override
public int freq() {
return freq;
}
@Override
public int nextPosition() {
assert skipPositions > 0;
skipPositions--;
pos = postings[upto++];
if (hasOffsets) {
startOffset = postings[upto++];
endOffset = postings[upto++];
}
if (hasPayloads) {
payloadLength = postings[upto++];
lastPayloadOffset = payloadOffset;
payloadOffset += payloadLength;
}
return pos;
}
@Override
public int startOffset() {
return startOffset;
}
@Override
public int endOffset() {
return endOffset;
}
@Override
public int advance(int target) throws IOException {
return slowAdvance(target);
}
@Override
public BytesRef getPayload() {
if (payloadLength > 0) {
payload.bytes = payloadBytes;
payload.offset = lastPayloadOffset;
payload.length = payloadLength;
return payload;
} else {
return null;
}
}
@Override
public long cost() {
// TODO: could do a better estimate
return postings.length / 2;
}
}
// Docs + freqs:
private static final class HighFreqDocsEnum extends PostingsEnum {
private int[] docIDs;
private int[] freqs;
private int upto;
private int docID = -1;
public HighFreqDocsEnum() {}
public int[] getDocIDs() {
return docIDs;
}
public int[] getFreqs() {
return freqs;
}
public PostingsEnum reset(int[] docIDs, int[] freqs) {
this.docIDs = docIDs;
this.freqs = freqs;
docID = upto = -1;
return this;
}
@Override
public int nextDoc() {
upto++;
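// Rely on the array's own bounds check rather than an explicit length test on this hot
// path; the catch below turns the overflow into NO_MORE_DOCS: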
try {
return docID = docIDs[upto];
} catch (ArrayIndexOutOfBoundsException e) {
}
return docID = NO_MORE_DOCS;
}
@Override
public int docID() {
return docID;
}
@Override
public int freq() {
if (freqs == null) {
return 1;
} else {
return freqs[upto];
}
}
@Override
public int advance(int target) {
/*
upto++;
if (upto == docIDs.length) {
return docID = NO_MORE_DOCS;
}
final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target);
if (index < 0) {
upto = -index - 1;
} else {
upto = index;
}
if (liveDocs != null) {
while (upto < docIDs.length) {
if (liveDocs.get(docIDs[upto])) {
break;
}
upto++;
}
}
if (upto == docIDs.length) {
return NO_MORE_DOCS;
} else {
return docID = docIDs[upto];
}
*/
// System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + "
// of " + docIDs.length);
// if (DEBUG) {
// System.out.println("advance target=" + target + " len=" + docIDs.length);
// }
upto++;
if (upto == docIDs.length) {
return docID = NO_MORE_DOCS;
}
// First "grow" outwards, since most advances are to
// nearby docs:
int inc = 10;
int nextUpto = upto + 10;
int low;
int high;
while (true) {
// System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc);
if (nextUpto >= docIDs.length) {
low = nextUpto - inc;
high = docIDs.length - 1;
break;
}
// System.out.println(" docID=" + docIDs[nextUpto]);
if (target <= docIDs[nextUpto]) {
low = nextUpto - inc;
high = nextUpto;
break;
}
inc *= 2;
nextUpto += inc;
}
// Now do normal binary search
// System.out.println(" after fwd: low=" + low + " high=" + high);
while (true) {
if (low > high) {
// Not exactly found
// System.out.println(" break: no match");
upto = low;
break;
}
int mid = (low + high) >>> 1;
int cmp = docIDs[mid] - target;
// System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" +
// docIDs[mid]);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
// Found target
upto = mid;
// System.out.println(" break: match");
break;
}
}
// System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ?
// NO_MORE_DOCS : docIDs[upto]));
if (upto == docIDs.length) {
// System.out.println(" return END");
return docID = NO_MORE_DOCS;
} else {
// System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto);
return docID = docIDs[upto];
}
}
@Override
public long cost() {
return docIDs.length;
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
}
// TODO: specialize for the with-offsets and no-offsets cases
private static final class HighFreqPostingsEnum extends PostingsEnum {
private int[] docIDs;
private int[] freqs;
private int[][] positions;
private byte[][][] payloads;
private final boolean hasOffsets;
private final int posJump;
private int upto;
private int docID = -1;
private int posUpto;
private int[] curPositions;
public HighFreqPostingsEnum(boolean hasOffsets) {
this.hasOffsets = hasOffsets;
posJump = hasOffsets ? 3 : 1;
}
public int[] getDocIDs() {
return docIDs;
}
public int[][] getPositions() {
return positions;
}
public int getPosJump() {
return posJump;
}
public PostingsEnum reset(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads) {
this.docIDs = docIDs;
this.freqs = freqs;
this.positions = positions;
this.payloads = payloads;
upto = -1;
return this;
}
@Override
public int nextDoc() {
upto++;
if (upto < docIDs.length) {
posUpto = -posJump;
curPositions = positions[upto];
return docID = docIDs[upto];
}
return docID = NO_MORE_DOCS;
}
@Override
public int freq() {
return freqs[upto];
}
@Override
public int docID() {
return docID;
}
@Override
public int nextPosition() {
posUpto += posJump;
assert posUpto < curPositions.length;
return curPositions[posUpto];
}
@Override
public int startOffset() {
if (hasOffsets) {
return curPositions[posUpto + 1];
} else {
return -1;
}
}
@Override
public int endOffset() {
if (hasOffsets) {
return curPositions[posUpto + 2];
} else {
return -1;
}
}
@Override
public int advance(int target) {
/*
upto++;
if (upto == docIDs.length) {
return NO_MORE_DOCS;
}
final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target);
if (index < 0) {
upto = -index - 1;
} else {
upto = index;
}
if (liveDocs != null) {
while (upto < docIDs.length) {
if (liveDocs.get(docIDs[upto])) {
break;
}
upto++;
}
}
posUpto = hasOffsets ? -3 : -1;
if (upto == docIDs.length) {
return NO_MORE_DOCS;
} else {
return docID();
}
*/
// System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + "
// of " + docIDs.length);
// if (DEBUG) {
// System.out.println("advance target=" + target + " len=" + docIDs.length);
// }
upto++;
if (upto == docIDs.length) {
return docID = NO_MORE_DOCS;
}
// First "grow" outwards, since most advances are to
// nearby docs:
int inc = 10;
int nextUpto = upto + 10;
int low;
int high;
while (true) {
// System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc);
if (nextUpto >= docIDs.length) {
low = nextUpto - inc;
high = docIDs.length - 1;
break;
}
// System.out.println(" docID=" + docIDs[nextUpto]);
if (target <= docIDs[nextUpto]) {
low = nextUpto - inc;
high = nextUpto;
break;
}
inc *= 2;
nextUpto += inc;
}
// Now do normal binary search
// System.out.println(" after fwd: low=" + low + " high=" + high);
while (true) {
if (low > high) {
// Not exactly found
// System.out.println(" break: no match");
upto = low;
break;
}
int mid = (low + high) >>> 1;
int cmp = docIDs[mid] - target;
// System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" +
// docIDs[mid]);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
// Found target
upto = mid;
// System.out.println(" break: match");
break;
}
}
// System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ?
// NO_MORE_DOCS : docIDs[upto]));
if (upto == docIDs.length) {
// System.out.println(" return END");
return docID = NO_MORE_DOCS;
} else {
// System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto);
posUpto = -posJump;
curPositions = positions[upto];
return docID = docIDs[upto];
}
}
private final BytesRef payload = new BytesRef();
@Override
public BytesRef getPayload() {
if (payloads == null) {
return null;
} else {
final byte[] payloadBytes = payloads[upto][posUpto / (hasOffsets ? 3 : 1)];
if (payloadBytes == null) {
return null;
}
payload.bytes = payloadBytes;
payload.length = payloadBytes.length;
payload.offset = 0;
return payload;
}
}
@Override
public long cost() {
return docIDs.length;
}
}
}