/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.memory;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
// TODO:
// - build depth-N prefix hash?
// - or: longer dense skip lists than just next byte?
/** Wraps the {@link Lucene84PostingsFormat} for on-disk
* storage, but at read time loads and stores all
* terms and postings directly in RAM as byte[] and int[].
*
* <p><b>WARNING</b>: This is
* exceptionally RAM intensive: it makes no effort to
* compress the postings data, storing terms as separate
* byte[] and postings as separate int[], but in exchange it
* gives a substantial increase in search performance.
*
* <p>This postings format supports {@link TermsEnum#ord}
* and {@link TermsEnum#seekExact(long)}.
* <p>Because this holds all term bytes as a single
* byte[], you cannot have more than 2.1GB worth of term
* bytes in a single segment.
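*
* <p>A minimal usage sketch (assuming Lucene 8.4's default
* {@code Lucene84Codec}; {@code analyzer} stands in for whatever
* analyzer you already use):
*
* <pre class="prettyprint">{@code
* IndexWriterConfig config = new IndexWriterConfig(analyzer);
* config.setCodec(new Lucene84Codec() {
*   public PostingsFormat getPostingsFormatForField(String field) {
*     return new DirectPostingsFormat(); // or new DirectPostingsFormat(8, 32)
*   }
* });
* }</pre>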
*
* @lucene.experimental */
public final class DirectPostingsFormat extends PostingsFormat {
private final int minSkipCount;
private final int lowFreqCutoff;
private final static int DEFAULT_MIN_SKIP_COUNT = 8;
private final static int DEFAULT_LOW_FREQ_CUTOFF = 32;
//private static final boolean DEBUG = true;
// TODO: allow passing/wrapping arbitrary postings format?
public DirectPostingsFormat() {
this(DEFAULT_MIN_SKIP_COUNT, DEFAULT_LOW_FREQ_CUTOFF);
}
/** minSkipCount is how many terms in a row must share the
* same prefix before we put down a skip pointer. Terms
* with docFreq &lt;= lowFreqCutoff will use a single int[]
* to hold all docs, freqs, positions and offsets; terms
* with higher docFreq will use separate arrays. */
public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) {
super("Direct");
this.minSkipCount = minSkipCount;
this.lowFreqCutoff = lowFreqCutoff;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return PostingsFormat.forName("Lucene84").fieldsConsumer(state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
FieldsProducer postings = PostingsFormat.forName("Lucene84").fieldsProducer(state);
if (state.context.context != IOContext.Context.MERGE) {
FieldsProducer loadedPostings;
try {
postings.checkIntegrity();
loadedPostings = new DirectFields(state, postings, minSkipCount, lowFreqCutoff);
} finally {
postings.close();
}
return loadedPostings;
} else {
// Don't load postings for merge:
return postings;
}
}
private static final class DirectFields extends FieldsProducer {
private final Map<String,DirectField> fields = new TreeMap<>();
public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff) throws IOException {
for (String field : fields) {
this.fields.put(field, new DirectField(state, field, fields.terms(field), minSkipCount, lowFreqCutoff));
}
}
@Override
public Iterator<String> iterator() {
return Collections.unmodifiableSet(fields.keySet()).iterator();
}
@Override
public Terms terms(String field) {
return fields.get(field);
}
@Override
public int size() {
return fields.size();
}
@Override
public void close() {
}
@Override
public long ramBytesUsed() {
long sizeInBytes = 0;
for(Map.Entry<String,DirectField> entry: fields.entrySet()) {
sizeInBytes += entry.getKey().length() * Character.BYTES;
sizeInBytes += entry.getValue().ramBytesUsed();
}
return sizeInBytes;
}
@Override
public Collection<Accountable> getChildResources() {
return Accountables.namedAccountables("field", fields);
}
@Override
public void checkIntegrity() throws IOException {
// if we read entirely into ram, we already validated at load time;
// otherwise we returned the raw postings reader from fieldsProducer
}
@Override
public String toString() {
return getClass().getSimpleName() + "(fields=" + fields.size() + ")";
}
}
private final static class DirectField extends Terms implements Accountable {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(DirectField.class);
private static abstract class TermAndSkip implements Accountable {
public int[] skips;
}
private static final class LowFreqTerm extends TermAndSkip {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(LowFreqTerm.class);
public final int[] postings;
public final byte[] payloads;
public final int docFreq;
public final int totalTermFreq;
public LowFreqTerm(int[] postings, byte[] payloads, int docFreq, int totalTermFreq) {
this.postings = postings;
this.payloads = payloads;
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED +
((postings!=null) ? RamUsageEstimator.sizeOf(postings) : 0) +
((payloads!=null) ? RamUsageEstimator.sizeOf(payloads) : 0);
}
}
// TODO: maybe specialize into prx/no-prx/no-frq cases?
private static final class HighFreqTerm extends TermAndSkip {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(HighFreqTerm.class);
public final long totalTermFreq;
public final int[] docIDs;
public final int[] freqs;
public final int[][] positions;
public final byte[][][] payloads;
public HighFreqTerm(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads, long totalTermFreq) {
this.docIDs = docIDs;
this.freqs = freqs;
this.positions = positions;
this.payloads = payloads;
this.totalTermFreq = totalTermFreq;
}
@Override
public long ramBytesUsed() {
long sizeInBytes = BASE_RAM_BYTES_USED;
sizeInBytes += (docIDs!=null)? RamUsageEstimator.sizeOf(docIDs) : 0;
sizeInBytes += (freqs!=null)? RamUsageEstimator.sizeOf(freqs) : 0;
if(positions != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(positions);
for(int[] position : positions) {
sizeInBytes += (position!=null) ? RamUsageEstimator.sizeOf(position) : 0;
}
}
if (payloads != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(payloads);
for(byte[][] payload : payloads) {
if(payload != null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(payload);
for(byte[] pload : payload) {
sizeInBytes += (pload!=null) ? RamUsageEstimator.sizeOf(pload) : 0;
}
}
}
}
return sizeInBytes;
}
}
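// Concatenated bytes of all terms, in ord order; term ord i occupies
// termBytes[termOffsets[i]] (inclusive) through termBytes[termOffsets[i+1]] (exclusive):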
private final byte[] termBytes;
private final int[] termOffsets;
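// Flattened skip pointers, in ord order; term ord i owns
// skips[skipOffsets[i]] through skips[skipOffsets[i+1]]: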
private final int[] skips;
private final int[] skipOffsets;
private final TermAndSkip[] terms;
private final boolean hasFreq;
private final boolean hasPos;
private final boolean hasOffsets;
private final boolean hasPayloads;
private final long sumTotalTermFreq;
private final int docCount;
private final long sumDocFreq;
private int skipCount;
// TODO: maybe make a separate builder? These are only
// used during load:
private int count;
private int[] sameCounts = new int[10];
private final int minSkipCount;
private final static class IntArrayWriter {
private int[] ints = new int[10];
private int upto;
public void add(int value) {
if (ints.length == upto) {
ints = ArrayUtil.grow(ints);
}
ints[upto++] = value;
}
public int[] get() {
final int[] arr = new int[upto];
System.arraycopy(ints, 0, arr, 0, upto);
upto = 0;
return arr;
}
}
public DirectField(SegmentReadState state, String field, Terms termsIn, int minSkipCount, int lowFreqCutoff) throws IOException {
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
sumTotalTermFreq = termsIn.getSumTotalTermFreq();
sumDocFreq = termsIn.getSumDocFreq();
docCount = termsIn.getDocCount();
final int numTerms = (int) termsIn.size();
if (numTerms == -1) {
throw new IllegalArgumentException("codec does not provide Terms.size()");
}
terms = new TermAndSkip[numTerms];
termOffsets = new int[1+numTerms];
byte[] termBytes = new byte[1024];
this.minSkipCount = minSkipCount;
hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS) > 0;
hasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) > 0;
hasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0;
hasPayloads = fieldInfo.hasPayloads();
BytesRef term;
PostingsEnum postingsEnum = null;
PostingsEnum docsAndPositionsEnum = null;
final TermsEnum termsEnum = termsIn.iterator();
int termOffset = 0;
final IntArrayWriter scratch = new IntArrayWriter();
// Used for payloads, if any:
final RAMOutputStream ros = new RAMOutputStream();
// if (DEBUG) {
// System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + " hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" + hasPayloads);
// }
while ((term = termsEnum.next()) != null) {
final int docFreq = termsEnum.docFreq();
final long totalTermFreq = termsEnum.totalTermFreq();
// if (DEBUG) {
// System.out.println(" term=" + term.utf8ToString());
// }
termOffsets[count] = termOffset;
if (termBytes.length < (termOffset + term.length)) {
termBytes = ArrayUtil.grow(termBytes, termOffset + term.length);
}
System.arraycopy(term.bytes, term.offset, termBytes, termOffset, term.length);
termOffset += term.length;
termOffsets[count+1] = termOffset;
if (hasPos) {
docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.ALL);
} else {
postingsEnum = termsEnum.postings(postingsEnum);
}
final TermAndSkip ent;
final PostingsEnum postingsEnum2;
if (hasPos) {
postingsEnum2 = docsAndPositionsEnum;
} else {
postingsEnum2 = postingsEnum;
}
int docID;
if (docFreq <= lowFreqCutoff) {
ros.reset();
// Pack postings for low-freq terms into a single int[]:
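// The per-document layout, matching the adds below, is:
//   docID [, freq [, freq x (position [, startOffset, endOffset] [, payloadLength])]]
// with the payload bytes themselves accumulated separately in ros: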
while ((docID = postingsEnum2.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
scratch.add(docID);
if (hasFreq) {
final int freq = postingsEnum2.freq();
scratch.add(freq);
if (hasPos) {
for(int pos=0;pos<freq;pos++) {
scratch.add(docsAndPositionsEnum.nextPosition());
if (hasOffsets) {
scratch.add(docsAndPositionsEnum.startOffset());
scratch.add(docsAndPositionsEnum.endOffset());
}
if (hasPayloads) {
final BytesRef payload = docsAndPositionsEnum.getPayload();
if (payload != null) {
scratch.add(payload.length);
ros.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
scratch.add(0);
}
}
}
}
}
}
final byte[] payloads;
if (hasPayloads) {
payloads = new byte[(int) ros.getFilePointer()];
ros.writeTo(payloads, 0);
} else {
payloads = null;
}
final int[] postings = scratch.get();
ent = new LowFreqTerm(postings, payloads, docFreq, (int) totalTermFreq);
} else {
final int[] docs = new int[docFreq];
final int[] freqs;
final int[][] positions;
final byte[][][] payloads;
if (hasFreq) {
freqs = new int[docFreq];
if (hasPos) {
positions = new int[docFreq][];
if (hasPayloads) {
payloads = new byte[docFreq][][];
} else {
payloads = null;
}
} else {
positions = null;
payloads = null;
}
} else {
freqs = null;
positions = null;
payloads = null;
}
// Use separate int[] for the postings for high-freq
// terms:
int upto = 0;
while ((docID = postingsEnum2.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
docs[upto] = docID;
if (hasFreq) {
final int freq = postingsEnum2.freq();
freqs[upto] = freq;
if (hasPos) {
final int mult;
if (hasOffsets) {
mult = 3;
} else {
mult = 1;
}
if (hasPayloads) {
payloads[upto] = new byte[freq][];
}
positions[upto] = new int[mult*freq];
int posUpto = 0;
for(int pos=0;pos<freq;pos++) {
positions[upto][posUpto] = docsAndPositionsEnum.nextPosition();
if (hasPayloads) {
BytesRef payload = docsAndPositionsEnum.getPayload();
if (payload != null) {
byte[] payloadBytes = new byte[payload.length];
System.arraycopy(payload.bytes, payload.offset, payloadBytes, 0, payload.length);
payloads[upto][pos] = payloadBytes;
}
}
posUpto++;
if (hasOffsets) {
positions[upto][posUpto++] = docsAndPositionsEnum.startOffset();
positions[upto][posUpto++] = docsAndPositionsEnum.endOffset();
}
}
}
}
upto++;
}
assert upto == docFreq;
ent = new HighFreqTerm(docs, freqs, positions, payloads, totalTermFreq);
}
terms[count] = ent;
setSkips(count, termBytes);
count++;
}
// End sentinel:
termOffsets[count] = termOffset;
finishSkips();
//System.out.println(skipCount + " skips: " + field);
this.termBytes = new byte[termOffset];
System.arraycopy(termBytes, 0, this.termBytes, 0, termOffset);
// Pack skips:
this.skips = new int[skipCount];
this.skipOffsets = new int[1+numTerms];
int skipOffset = 0;
for(int i=0;i<numTerms;i++) {
final int[] termSkips = terms[i].skips;
skipOffsets[i] = skipOffset;
if (termSkips != null) {
System.arraycopy(termSkips, 0, skips, skipOffset, termSkips.length);
skipOffset += termSkips.length;
terms[i].skips = null;
}
}
this.skipOffsets[numTerms] = skipOffset;
assert skipOffset == skipCount;
}
@Override
public long ramBytesUsed() {
long sizeInBytes = BASE_RAM_BYTES_USED;
sizeInBytes += ((termBytes!=null) ? RamUsageEstimator.sizeOf(termBytes) : 0);
sizeInBytes += ((termOffsets!=null) ? RamUsageEstimator.sizeOf(termOffsets) : 0);
sizeInBytes += ((skips!=null) ? RamUsageEstimator.sizeOf(skips) : 0);
sizeInBytes += ((skipOffsets!=null) ? RamUsageEstimator.sizeOf(skipOffsets) : 0);
sizeInBytes += ((sameCounts!=null) ? RamUsageEstimator.sizeOf(sameCounts) : 0);
if(terms!=null) {
sizeInBytes += RamUsageEstimator.shallowSizeOf(terms);
for(TermAndSkip termAndSkip : terms) {
sizeInBytes += (termAndSkip!=null) ? termAndSkip.ramBytesUsed() : 0;
}
}
return sizeInBytes;
}
@Override
public String toString() {
return "DirectTerms(terms=" + terms.length + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")";
}
// Compares in Unicode (UTF-8 byte) order:
int compare(int ord, BytesRef other) {
final byte[] otherBytes = other.bytes;
int upto = termOffsets[ord];
final int termLen = termOffsets[1+ord] - upto;
int otherUpto = other.offset;
final int stop = upto + Math.min(termLen, other.length);
while (upto < stop) {
int diff = (termBytes[upto++] & 0xFF) - (otherBytes[otherUpto++] & 0xFF);
if (diff != 0) {
return diff;
}
}
// One is a prefix of the other, or, they are equal:
return termLen - other.length;
}
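// sameCounts[i] is the number of consecutive terms, ending at the current
// one, that share a prefix of length i+1. When such a run ends and its
// length reached minSkipCount, saveSkip records a skip pointer on the
// first term of the run, pointing at the first ord past the run: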
private void setSkips(int termOrd, byte[] termBytes) {
final int termLength = termOffsets[termOrd+1] - termOffsets[termOrd];
if (sameCounts.length < termLength) {
sameCounts = ArrayUtil.grow(sameCounts, termLength);
}
// Update skip pointers:
if (termOrd > 0) {
final int lastTermLength = termOffsets[termOrd] - termOffsets[termOrd-1];
final int limit = Math.min(termLength, lastTermLength);
int lastTermOffset = termOffsets[termOrd-1];
int termOffset = termOffsets[termOrd];
int i = 0;
for(;i<limit;i++) {
if (termBytes[lastTermOffset++] == termBytes[termOffset++]) {
sameCounts[i]++;
} else {
for(;i<limit;i++) {
if (sameCounts[i] >= minSkipCount) {
// Go back and add a skip pointer:
saveSkip(termOrd, sameCounts[i]);
}
sameCounts[i] = 1;
}
break;
}
}
for(;i<lastTermLength;i++) {
if (sameCounts[i] >= minSkipCount) {
// Go back and add a skip pointer:
saveSkip(termOrd, sameCounts[i]);
}
sameCounts[i] = 0;
}
for(int j=limit;j<termLength;j++) {
sameCounts[j] = 1;
}
} else {
for(int i=0;i<termLength;i++) {
sameCounts[i]++;
}
}
}
private void finishSkips() {
assert count == terms.length;
int lastTermOffset = termOffsets[count-1];
int lastTermLength = termOffsets[count] - lastTermOffset;
for(int i=0;i<lastTermLength;i++) {
if (sameCounts[i] >= minSkipCount) {
// Go back and add a skip pointer:
saveSkip(count, sameCounts[i]);
}
}
// Reverse the skip pointers so they are "nested":
for(int termID=0;termID<terms.length;termID++) {
TermAndSkip term = terms[termID];
if (term.skips != null && term.skips.length > 1) {
for(int pos=0;pos<term.skips.length/2;pos++) {
final int otherPos = term.skips.length-pos-1;
final int temp = term.skips[pos];
term.skips[pos] = term.skips[otherPos];
term.skips[otherPos] = temp;
}
}
}
}
private void saveSkip(int ord, int backCount) {
final TermAndSkip term = terms[ord - backCount];
skipCount++;
if (term.skips == null) {
term.skips = new int[] {ord};
} else {
// Normally we'd grow at a slight exponential... but
// given that the skips themselves are already log(N)
// we can grow by only 1 and still have amortized
// linear time:
final int[] newSkips = new int[term.skips.length+1];
System.arraycopy(term.skips, 0, newSkips, 0, term.skips.length);
term.skips = newSkips;
term.skips[term.skips.length-1] = ord;
}
}
@Override
public TermsEnum iterator() {
return new DirectTermsEnum();
}
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new DirectIntersectTermsEnum(compiled, startTerm);
}
@Override
public long size() {
return terms.length;
}
@Override
public long getSumTotalTermFreq() {
return sumTotalTermFreq;
}
@Override
public long getSumDocFreq() {
return sumDocFreq;
}
@Override
public int getDocCount() {
return docCount;
}
@Override
public boolean hasFreqs() {
return hasFreq;
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPos;
}
@Override
public boolean hasPayloads() {
return hasPayloads;
}
private final class DirectTermsEnum extends BaseTermsEnum {
private final BytesRef scratch = new BytesRef();
private int termOrd;
private DirectTermsEnum() {
termOrd = -1;
}
private BytesRef setTerm() {
scratch.bytes = termBytes;
scratch.offset = termOffsets[termOrd];
scratch.length = termOffsets[termOrd+1] - termOffsets[termOrd];
return scratch;
}
@Override
public BytesRef next() {
termOrd++;
if (termOrd < terms.length) {
return setTerm();
} else {
return null;
}
}
@Override
public TermState termState() {
OrdTermState state = new OrdTermState();
state.ord = termOrd;
return state;
}
// If non-negative, exact match; else, -ord-1, where ord
// is where you would insert the term.
private int findTerm(BytesRef term) {
// Just do binary search: should be (constant factor)
// faster than using the skip list:
int low = 0;
int high = terms.length-1;
while (low <= high) {
int mid = (low + high) >>> 1;
int cmp = compare(mid, term);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
return mid; // key found
}
}
return -(low + 1); // key not found.
}
@Override
public SeekStatus seekCeil(BytesRef term) {
// TODO: we should use the skip pointers; should be
// faster than bin search; we should also hold
// & reuse current state so seeking forwards is
// faster
final int ord = findTerm(term);
// if (DEBUG) {
// System.out.println(" find term=" + term.utf8ToString() + " ord=" + ord);
// }
if (ord >= 0) {
termOrd = ord;
setTerm();
return SeekStatus.FOUND;
} else if (ord == -terms.length-1) {
return SeekStatus.END;
} else {
termOrd = -ord - 1;
setTerm();
return SeekStatus.NOT_FOUND;
}
}
@Override
public boolean seekExact(BytesRef term) {
// TODO: we should use the skip pointers; should be
// faster than bin search; we should also hold
// & reuse current state so seeking forwards is
// faster
final int ord = findTerm(term);
if (ord >= 0) {
termOrd = ord;
setTerm();
return true;
} else {
return false;
}
}
@Override
public void seekExact(long ord) {
termOrd = (int) ord;
setTerm();
}
@Override
public void seekExact(BytesRef term, TermState state) throws IOException {
termOrd = (int) ((OrdTermState) state).ord;
setTerm();
assert term.equals(scratch);
}
@Override
public BytesRef term() {
return scratch;
}
@Override
public long ord() {
return termOrd;
}
@Override
public int docFreq() {
if (terms[termOrd] instanceof LowFreqTerm) {
return ((LowFreqTerm) terms[termOrd]).docFreq;
} else {
return ((HighFreqTerm) terms[termOrd]).docIDs.length;
}
}
@Override
public long totalTermFreq() {
if (terms[termOrd] instanceof LowFreqTerm) {
return ((LowFreqTerm) terms[termOrd]).totalTermFreq;
} else {
return ((HighFreqTerm) terms[termOrd]).totalTermFreq;
}
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
// TODO: implement reuse
// it's hairy!
// TODO: the logic of which enum impl to choose should be refactored to be simpler...
if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
if (terms[termOrd] instanceof LowFreqTerm) {
final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]);
final int[] postings = term.postings;
if (hasFreq == false) {
LowFreqDocsEnumNoTF docsEnum;
if (reuse instanceof LowFreqDocsEnumNoTF) {
docsEnum = (LowFreqDocsEnumNoTF) reuse;
} else {
docsEnum = new LowFreqDocsEnumNoTF();
}
return docsEnum.reset(postings);
} else if (hasPos == false) {
LowFreqDocsEnumNoPos docsEnum;
if (reuse instanceof LowFreqDocsEnumNoPos) {
docsEnum = (LowFreqDocsEnumNoPos) reuse;
} else {
docsEnum = new LowFreqDocsEnumNoPos();
}
return docsEnum.reset(postings);
}
final byte[] payloads = term.payloads;
return new LowFreqPostingsEnum(hasOffsets, hasPayloads).reset(postings, payloads);
} else {
final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
if (hasPos == false) {
return new HighFreqDocsEnum().reset(term.docIDs, term.freqs);
} else {
return new HighFreqPostingsEnum(hasOffsets).reset(term.docIDs, term.freqs, term.positions, term.payloads);
}
}
}
if (terms[termOrd] instanceof LowFreqTerm) {
final int[] postings = ((LowFreqTerm) terms[termOrd]).postings;
if (hasFreq) {
if (hasPos) {
int posLen;
if (hasOffsets) {
posLen = 3;
} else {
posLen = 1;
}
if (hasPayloads) {
posLen++;
}
LowFreqDocsEnum docsEnum;
if (reuse instanceof LowFreqDocsEnum) {
docsEnum = (LowFreqDocsEnum) reuse;
if (!docsEnum.canReuse(posLen)) {
docsEnum = new LowFreqDocsEnum(posLen);
}
} else {
docsEnum = new LowFreqDocsEnum(posLen);
}
return docsEnum.reset(postings);
} else {
LowFreqDocsEnumNoPos docsEnum;
if (reuse instanceof LowFreqDocsEnumNoPos) {
docsEnum = (LowFreqDocsEnumNoPos) reuse;
} else {
docsEnum = new LowFreqDocsEnumNoPos();
}
return docsEnum.reset(postings);
}
} else {
LowFreqDocsEnumNoTF docsEnum;
if (reuse instanceof LowFreqDocsEnumNoTF) {
docsEnum = (LowFreqDocsEnumNoTF) reuse;
} else {
docsEnum = new LowFreqDocsEnumNoTF();
}
return docsEnum.reset(postings);
}
} else {
final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
HighFreqDocsEnum docsEnum;
if (reuse instanceof HighFreqDocsEnum) {
docsEnum = (HighFreqDocsEnum) reuse;
} else {
docsEnum = new HighFreqDocsEnum();
}
//System.out.println(" DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() + ": " + term.docIDs.length + " docs");
return docsEnum.reset(term.docIDs, term.freqs);
}
}
@Override
public ImpactsEnum impacts(int flags) throws IOException {
return new SlowImpactsEnum(postings(null, flags));
}
}
private final class DirectIntersectTermsEnum extends BaseTermsEnum {
private final RunAutomaton runAutomaton;
private final CompiledAutomaton compiledAutomaton;
private int termOrd;
private final BytesRef scratch = new BytesRef();
private final class State {
int changeOrd;
int state;
int transitionUpto;
int transitionCount;
int transitionMax;
int transitionMin;
final Transition transition = new Transition();
}
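// Stack of automaton states, one frame per consumed label byte; each
// frame's changeOrd is the first term ord at which its prefix stops
// applying, i.e. when termOrd reaches changeOrd we pop the frame: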
private State[] states;
private int stateUpto;
public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) {
runAutomaton = compiled.runAutomaton;
compiledAutomaton = compiled;
termOrd = -1;
states = new State[1];
states[0] = new State();
states[0].changeOrd = terms.length;
states[0].state = 0;
states[0].transitionCount = compiledAutomaton.automaton.getNumTransitions(states[0].state);
compiledAutomaton.automaton.initTransition(states[0].state, states[0].transition);
states[0].transitionUpto = -1;
states[0].transitionMax = -1;
//System.out.println("IE.init startTerm=" + startTerm);
if (startTerm != null) {
int skipUpto = 0;
if (startTerm.length == 0) {
if (terms.length > 0 && termOffsets[1] == 0) {
termOrd = 0;
}
} else {
termOrd++;
nextLabel:
for(int i=0;i<startTerm.length;i++) {
final int label = startTerm.bytes[startTerm.offset+i] & 0xFF;
while (label > states[i].transitionMax) {
states[i].transitionUpto++;
assert states[i].transitionUpto < states[i].transitionCount;
compiledAutomaton.automaton.getNextTransition(states[i].transition);
states[i].transitionMin = states[i].transition.min;
states[i].transitionMax = states[i].transition.max;
assert states[i].transitionMin >= 0;
assert states[i].transitionMin <= 255;
assert states[i].transitionMax >= 0;
assert states[i].transitionMax <= 255;
}
// Skip forwards until we find a term matching
// the label at this position:
while (termOrd < terms.length) {
final int skipOffset = skipOffsets[termOrd];
final int numSkips = skipOffsets[termOrd+1] - skipOffset;
final int termOffset = termOffsets[termOrd];
final int termLength = termOffsets[1+termOrd] - termOffset;
// if (DEBUG) {
// System.out.println(" check termOrd=" + termOrd + " term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips) + " i=" + i);
// }
if (termOrd == states[stateUpto].changeOrd) {
// if (DEBUG) {
// System.out.println(" end push return");
// }
stateUpto--;
termOrd--;
return;
}
if (termLength == i) {
termOrd++;
skipUpto = 0;
// if (DEBUG) {
// System.out.println(" term too short; next term");
// }
} else if (label < (termBytes[termOffset+i] & 0xFF)) {
termOrd--;
// if (DEBUG) {
// System.out.println(" no match; already beyond; return termOrd=" + termOrd);
// }
stateUpto -= skipUpto;
assert stateUpto >= 0;
return;
} else if (label == (termBytes[termOffset+i] & 0xFF)) {
// if (DEBUG) {
// System.out.println(" label[" + i + "] matches");
// }
if (skipUpto < numSkips) {
grow();
final int nextState = runAutomaton.step(states[stateUpto].state, label);
// Automaton is required to accept startTerm:
assert nextState != -1;
stateUpto++;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
states[stateUpto].state = nextState;
states[stateUpto].transitionCount = compiledAutomaton.automaton.getNumTransitions(nextState);
compiledAutomaton.automaton.initTransition(states[stateUpto].state, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
//System.out.println(" push " + states[stateUpto].transitions.length + " trans");
// if (DEBUG) {
// System.out.println(" push skip; changeOrd=" + states[stateUpto].changeOrd);
// }
// Match next label at this same term:
continue nextLabel;
} else {
// if (DEBUG) {
// System.out.println(" linear scan");
// }
// Index exhausted: just scan now (the
// number of scans required will be less
// than the minSkipCount):
final int startTermOrd = termOrd;
while (termOrd < terms.length && compare(termOrd, startTerm) <= 0) {
assert termOrd == startTermOrd || skipOffsets[termOrd] == skipOffsets[termOrd+1];
termOrd++;
}
assert termOrd - startTermOrd < minSkipCount;
termOrd--;
stateUpto -= skipUpto;
// if (DEBUG) {
// System.out.println(" end termOrd=" + termOrd);
// }
return;
}
} else {
if (skipUpto < numSkips) {
termOrd = skips[skipOffset + skipUpto];
// if (DEBUG) {
// System.out.println(" no match; skip to termOrd=" + termOrd);
// }
} else {
// if (DEBUG) {
// System.out.println(" no match; next term");
// }
termOrd++;
}
skipUpto = 0;
}
}
// startTerm is >= last term so enum will not
// return any terms:
termOrd--;
// if (DEBUG) {
// System.out.println(" beyond end; no terms will match");
// }
return;
}
}
final int termOffset = termOffsets[termOrd];
final int termLen = termOffsets[1+termOrd] - termOffset;
if (termOrd >= 0 && !startTerm.equals(new BytesRef(termBytes, termOffset, termLen))) {
stateUpto -= skipUpto;
termOrd--;
}
// if (DEBUG) {
// System.out.println(" loop end; return termOrd=" + termOrd + " stateUpto=" + stateUpto);
// }
}
}
private void grow() {
if (states.length == 1+stateUpto) {
final State[] newStates = new State[states.length+1];
System.arraycopy(states, 0, newStates, 0, states.length);
newStates[states.length] = new State();
states = newStates;
}
}
@Override
public BytesRef next() {
// if (DEBUG) {
// System.out.println("\nIE.next");
// }
termOrd++;
int skipUpto = 0;
if (termOrd == 0 && termOffsets[1] == 0) {
// Special-case empty string:
assert stateUpto == 0;
// if (DEBUG) {
// System.out.println(" visit empty string");
// }
if (runAutomaton.isAccept(states[0].state)) {
scratch.bytes = termBytes;
scratch.offset = 0;
scratch.length = 0;
return scratch;
}
termOrd++;
}
nextTerm:
while (true) {
// if (DEBUG) {
// System.out.println(" cycle termOrd=" + termOrd + " stateUpto=" + stateUpto + " skipUpto=" + skipUpto);
// }
if (termOrd == terms.length) {
// if (DEBUG) {
// System.out.println(" return END");
// }
return null;
}
final State state = states[stateUpto];
if (termOrd == state.changeOrd) {
// Pop:
// if (DEBUG) {
// System.out.println(" pop stateUpto=" + stateUpto);
// }
stateUpto--;
/*
if (DEBUG) {
try {
//System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)).utf8ToString());
System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)));
} catch (ArrayIndexOutOfBoundsException aioobe) {
System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)));
}
}
*/
continue;
}
final int termOffset = termOffsets[termOrd];
final int termLength = termOffsets[termOrd+1] - termOffset;
final int skipOffset = skipOffsets[termOrd];
final int numSkips = skipOffsets[termOrd+1] - skipOffset;
// if (DEBUG) {
// System.out.println(" term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips));
// }
assert termOrd < state.changeOrd;
assert stateUpto <= termLength: "term.length=" + termLength + "; stateUpto=" + stateUpto;
final int label = termBytes[termOffset+stateUpto] & 0xFF;
while (label > state.transitionMax) {
//System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length);
state.transitionUpto++;
if (state.transitionUpto == state.transitionCount) {
// We've exhausted transitions leaving this
// state; force pop+next/skip now:
//System.out.println("forcepop: stateUpto=" + stateUpto);
if (stateUpto == 0) {
termOrd = terms.length;
return null;
} else {
assert state.changeOrd > termOrd;
// if (DEBUG) {
// System.out.println(" jumpend " + (state.changeOrd - termOrd));
// }
//System.out.println(" jump to termOrd=" + states[stateUpto].changeOrd + " vs " + termOrd);
termOrd = states[stateUpto].changeOrd;
skipUpto = 0;
stateUpto--;
}
continue nextTerm;
}
compiledAutomaton.automaton.getNextTransition(state.transition);
assert state.transitionUpto < state.transitionCount: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitionCount;
state.transitionMin = state.transition.min;
state.transitionMax = state.transition.max;
assert state.transitionMin >= 0;
assert state.transitionMin <= 255;
assert state.transitionMax >= 0;
assert state.transitionMax <= 255;
}
/*
if (DEBUG) {
System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + (char) label + "(" + label + ") term=" + new BytesRef(terms[termOrd].term).utf8ToString() + " trans " +
(char) state.transitionMin + "(" + state.transitionMin + ")" + "-" + (char) state.transitionMax + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips)));
System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + Integer.toHexString(label) + "(" + label + ") term=" + new BytesRef(termBytes, termOffset, termLength) + " trans " +
Integer.toHexString(state.transitionMin) + "(" + state.transitionMin + ")" + "-" + Integer.toHexString(state.transitionMax) + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips)));
}
*/
final int targetLabel = state.transitionMin;
if ((termBytes[termOffset+stateUpto] & 0xFF) < targetLabel) {
// if (DEBUG) {
// System.out.println(" do bin search");
// }
//int startTermOrd = termOrd;
int low = termOrd+1;
int high = state.changeOrd-1;
while (true) {
if (low > high) {
// Label not found
termOrd = low;
// if (DEBUG) {
// System.out.println(" advanced by " + (termOrd - startTermOrd));
// }
//System.out.println(" jump " + (termOrd - startTermOrd));
skipUpto = 0;
continue nextTerm;
}
int mid = (low + high) >>> 1;
int cmp = (termBytes[termOffsets[mid] + stateUpto] & 0xFF) - targetLabel;
// if (DEBUG) {
// System.out.println(" bin: check label=" + (char) (termBytes[termOffsets[low] + stateUpto] & 0xFF) + " ord=" + mid);
// }
if (cmp < 0) {
low = mid+1;
} else if (cmp > 0) {
high = mid - 1;
} else {
// Label found; walk backwards to first
// occurrence:
while (mid > termOrd && (termBytes[termOffsets[mid-1] + stateUpto] & 0xFF) == targetLabel) {
mid--;
}
termOrd = mid;
// if (DEBUG) {
// System.out.println(" advanced by " + (termOrd - startTermOrd));
// }
//System.out.println(" jump " + (termOrd - startTermOrd));
skipUpto = 0;
continue nextTerm;
}
}
}
int nextState = runAutomaton.step(states[stateUpto].state, label);
if (nextState == -1) {
// Skip
// if (DEBUG) {
// System.out.println(" automaton doesn't accept; skip");
// }
if (skipUpto < numSkips) {
// if (DEBUG) {
// System.out.println(" jump " + (skips[skipOffset+skipUpto]-1 - termOrd));
// }
termOrd = skips[skipOffset+skipUpto];
} else {
termOrd++;
}
skipUpto = 0;
} else if (skipUpto < numSkips) {
// Push:
// if (DEBUG) {
// System.out.println(" push");
// }
/*
if (DEBUG) {
try {
//System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1).utf8ToString());
System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1));
} catch (ArrayIndexOutOfBoundsException aioobe) {
System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1));
}
}
*/
grow();
stateUpto++;
states[stateUpto].state = nextState;
states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
states[stateUpto].transitionCount = compiledAutomaton.automaton.getNumTransitions(nextState);
compiledAutomaton.automaton.initTransition(nextState, states[stateUpto].transition);
states[stateUpto].transitionUpto = -1;
states[stateUpto].transitionMax = -1;
if (stateUpto == termLength) {
// if (DEBUG) {
// System.out.println(" term ends after push");
// }
if (runAutomaton.isAccept(nextState)) {
// if (DEBUG) {
// System.out.println(" automaton accepts: return");
// }
scratch.bytes = termBytes;
scratch.offset = termOffsets[termOrd];
scratch.length = termOffsets[1+termOrd] - scratch.offset;
// if (DEBUG) {
// System.out.println(" ret " + scratch.utf8ToString());
// }
return scratch;
} else {
// if (DEBUG) {
// System.out.println(" automaton rejects: nextTerm");
// }
termOrd++;
skipUpto = 0;
}
}
} else {
// Run the non-indexed tail of this term:
// TODO: add assert that we don't inc too many times
if (compiledAutomaton.commonSuffixRef != null) {
//System.out.println("suffix " + compiledAutomaton.commonSuffixRef.utf8ToString());
assert compiledAutomaton.commonSuffixRef.offset == 0;
if (termLength < compiledAutomaton.commonSuffixRef.length) {
termOrd++;
skipUpto = 0;
continue nextTerm;
}
int offset = termOffset + termLength - compiledAutomaton.commonSuffixRef.length;
for(int suffix=0;suffix<compiledAutomaton.commonSuffixRef.length;suffix++) {
if (termBytes[offset + suffix] != compiledAutomaton.commonSuffixRef.bytes[suffix]) {
termOrd++;
skipUpto = 0;
continue nextTerm;
}
}
}
int upto = stateUpto+1;
while (upto < termLength) {
nextState = runAutomaton.step(nextState, termBytes[termOffset+upto] & 0xFF);
if (nextState == -1) {
termOrd++;
skipUpto = 0;
// if (DEBUG) {
// System.out.println(" nomatch tail; next term");
// }
continue nextTerm;
}
upto++;
}
if (runAutomaton.isAccept(nextState)) {
scratch.bytes = termBytes;
scratch.offset = termOffsets[termOrd];
scratch.length = termOffsets[1+termOrd] - scratch.offset;
// if (DEBUG) {
// System.out.println(" match tail; return " + scratch.utf8ToString());
// System.out.println(" ret2 " + scratch.utf8ToString());
// }
return scratch;
} else {
termOrd++;
skipUpto = 0;
// if (DEBUG) {
// System.out.println(" nomatch tail; next term");
// }
}
}
}
}
@Override
public TermState termState() {
OrdTermState state = new OrdTermState();
state.ord = termOrd;
return state;
}
@Override
public BytesRef term() {
return scratch;
}
@Override
public long ord() {
return termOrd;
}
@Override
public int docFreq() {
if (terms[termOrd] instanceof LowFreqTerm) {
return ((LowFreqTerm) terms[termOrd]).docFreq;
} else {
return ((HighFreqTerm) terms[termOrd]).docIDs.length;
}
}
@Override
public long totalTermFreq() {
if (terms[termOrd] instanceof LowFreqTerm) {
return ((LowFreqTerm) terms[termOrd]).totalTermFreq;
} else {
return ((HighFreqTerm) terms[termOrd]).totalTermFreq;
}
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) {
// TODO: implement reuse
// it's hairy!
// TODO: the logic of which enum impl to choose should be refactored to be simpler...
if (hasPos && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
if (terms[termOrd] instanceof LowFreqTerm) {
final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]);
final int[] postings = term.postings;
final byte[] payloads = term.payloads;
return new LowFreqPostingsEnum(hasOffsets, hasPayloads).reset(postings, payloads);
} else {
final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
return new HighFreqPostingsEnum(hasOffsets).reset(term.docIDs, term.freqs, term.positions, term.payloads);
}
}
if (terms[termOrd] instanceof LowFreqTerm) {
final int[] postings = ((LowFreqTerm) terms[termOrd]).postings;
if (hasFreq) {
if (hasPos) {
int posLen;
if (hasOffsets) {
posLen = 3;
} else {
posLen = 1;
}
if (hasPayloads) {
posLen++;
}
return new LowFreqDocsEnum(posLen).reset(postings);
} else {
return new LowFreqDocsEnumNoPos().reset(postings);
}
} else {
return new LowFreqDocsEnumNoTF().reset(postings);
}
} else {
final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
// System.out.println("DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() + ": " + term.docIDs.length + " docs");
return new HighFreqDocsEnum().reset(term.docIDs, term.freqs);
}
}
@Override
public ImpactsEnum impacts(int flags) throws IOException {
return new SlowImpactsEnum(postings(null, flags));
}
@Override
public SeekStatus seekCeil(BytesRef term) {
throw new UnsupportedOperationException();
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
}
}
// Docs only:
private final static class LowFreqDocsEnumNoTF extends PostingsEnum {
private int[] postings;
private int upto;
public PostingsEnum reset(int[] postings) {
this.postings = postings;
upto = -1;
return this;
}
// TODO: can do this w/o setting members?
@Override
public int nextDoc() {
upto++;
if (upto < postings.length) {
return postings[upto];
}
return NO_MORE_DOCS;
}
@Override
public int docID() {
if (upto < 0) {
return -1;
} else if (upto < postings.length) {
return postings[upto];
} else {
return NO_MORE_DOCS;
}
}
@Override
public int freq() {
return 1;
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
@Override
public int advance(int target) throws IOException {
// Linear scan, but this is low-freq term so it won't
// be costly:
return slowAdvance(target);
}
@Override
public long cost() {
return postings.length;
}
}
// Docs + freqs:
private final static class LowFreqDocsEnumNoPos extends PostingsEnum {
private int[] postings;
private int upto;
public LowFreqDocsEnumNoPos() {}
public PostingsEnum reset(int[] postings) {
this.postings = postings;
upto = -2;
return this;
}
// TODO: can do this w/o setting members?
@Override
public int nextDoc() {
upto += 2;
if (upto < postings.length) {
return postings[upto];
}
return NO_MORE_DOCS;
}
@Override
public int docID() {
if (upto < 0) {
return -1;
} else if (upto < postings.length) {
return postings[upto];
} else {
return NO_MORE_DOCS;
}
}
@Override
public int freq() {
return postings[upto+1];
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
@Override
public int advance(int target) throws IOException {
// Linear scan, but this is low-freq term so it won't
// be costly:
return slowAdvance(target);
}
@Override
public long cost() {
return postings.length / 2;
}
}
// Docs + freqs + positions/offsets:
private final static class LowFreqDocsEnum extends PostingsEnum {
private int[] postings;
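// Number of ints each position occupies in the packed postings: 1 for
// the position itself, +2 if offsets are indexed, +1 if payloads are: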
private final int posMult;
private int upto;
private int freq;
public LowFreqDocsEnum(int posMult) {
this.posMult = posMult;
// if (DEBUG) {
// System.out.println("LowFreqDE: posMult=" + posMult);
// }
}
public boolean canReuse(int posMult) {
return this.posMult == posMult;
}
public PostingsEnum reset(int[] postings) {
this.postings = postings;
upto = -2;
freq = 0;
return this;
}
// TODO: can do this w/o setting members?
@Override
public int nextDoc() {
upto += 2 + freq*posMult;
// if (DEBUG) {
// System.out.println(" nextDoc freq=" + freq + " upto=" + upto + " vs " + postings.length);
// }
if (upto < postings.length) {
freq = postings[upto+1];
assert freq > 0;
return postings[upto];
}
return NO_MORE_DOCS;
}
@Override
public int docID() {
// TODO: store docID member?
if (upto < 0) {
return -1;
} else if (upto < postings.length) {
return postings[upto];
} else {
return NO_MORE_DOCS;
}
}
@Override
public int freq() {
// TODO: can I do postings[upto+1]?
return freq;
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
@Override
public int advance(int target) throws IOException {
// Linear scan, but this is low-freq term so it won't
// be costly:
return slowAdvance(target);
}
@Override
public long cost() {
// TODO: could do a better estimate
return postings.length / 2;
}
}
private final static class LowFreqPostingsEnum extends PostingsEnum {
private int[] postings;
private final int posMult;
private final boolean hasOffsets;
private final boolean hasPayloads;
private final BytesRef payload = new BytesRef();
private int upto;
private int docID;
private int freq;
private int skipPositions;
private int pos;
private int startOffset;
private int endOffset;
private int lastPayloadOffset;
private int payloadOffset;
private int payloadLength;
private byte[] payloadBytes;
public LowFreqPostingsEnum(boolean hasOffsets, boolean hasPayloads) {
this.hasOffsets = hasOffsets;
this.hasPayloads = hasPayloads;
if (hasOffsets) {
if (hasPayloads) {
posMult = 4;
} else {
posMult = 3;
}
} else if (hasPayloads) {
posMult = 2;
} else {
posMult = 1;
}
}
public PostingsEnum reset(int[] postings, byte[] payloadBytes) {
this.postings = postings;
upto = 0;
skipPositions = 0;
pos = -1;
startOffset = -1;
endOffset = -1;
docID = -1;
payloadLength = 0;
this.payloadBytes = payloadBytes;
return this;
}
@Override
public int nextDoc() {
pos = -1;
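// Skip over any positions of the previous doc that the caller never consumed: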
if (hasPayloads) {
for(int i=0;i<skipPositions;i++) {
upto++;
if (hasOffsets) {
upto += 2;
}
payloadOffset += postings[upto++];
}
} else {
upto += posMult * skipPositions;
}
if (upto < postings.length) {
docID = postings[upto++];
freq = postings[upto++];
skipPositions = freq;
return docID;
}
return docID = NO_MORE_DOCS;
}
@Override
public int docID() {
return docID;
}
@Override
public int freq() {
return freq;
}
@Override
public int nextPosition() {
assert skipPositions > 0;
skipPositions--;
pos = postings[upto++];
if (hasOffsets) {
startOffset = postings[upto++];
endOffset = postings[upto++];
}
if (hasPayloads) {
payloadLength = postings[upto++];
lastPayloadOffset = payloadOffset;
payloadOffset += payloadLength;
}
return pos;
}
@Override
public int startOffset() {
return startOffset;
}
@Override
public int endOffset() {
return endOffset;
}
@Override
public int advance(int target) throws IOException {
return slowAdvance(target);
}
@Override
public BytesRef getPayload() {
if (payloadLength > 0) {
payload.bytes = payloadBytes;
payload.offset = lastPayloadOffset;
payload.length = payloadLength;
return payload;
} else {
return null;
}
}
@Override
public long cost() {
// TODO: could do a better estimate
return postings.length / 2;
}
}
// Docs + freqs:
private final static class HighFreqDocsEnum extends PostingsEnum {
private int[] docIDs;
private int[] freqs;
private int upto;
private int docID = -1;
public HighFreqDocsEnum() {}
public int[] getDocIDs() {
return docIDs;
}
public int[] getFreqs() {
return freqs;
}
public PostingsEnum reset(int[] docIDs, int[] freqs) {
this.docIDs = docIDs;
this.freqs = freqs;
docID = upto = -1;
return this;
}
@Override
public int nextDoc() {
upto++;
if (upto < docIDs.length) {
return docID = docIDs[upto];
}
return docID = NO_MORE_DOCS;
}
@Override
public int docID() {
return docID;
}
@Override
public int freq() {
if (freqs == null) {
return 1;
} else {
return freqs[upto];
}
}
@Override
public int advance(int target) {
//System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + " of " + docIDs.length);
// if (DEBUG) {
// System.out.println("advance target=" + target + " len=" + docIDs.length);
// }
upto++;
if (upto == docIDs.length) {
return docID = NO_MORE_DOCS;
}
// First "grow" outwards, since most advances are to
// nearby docs:
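// (exponential search: double the step until we bracket target,
// then binary search within the bracketed window below)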
int inc = 10;
int nextUpto = upto+10;
int low;
int high;
while (true) {
//System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc);
if (nextUpto >= docIDs.length) {
low = nextUpto-inc;
high = docIDs.length-1;
break;
}
//System.out.println(" docID=" + docIDs[nextUpto]);
if (target <= docIDs[nextUpto]) {
low = nextUpto-inc;
high = nextUpto;
break;
}
inc *= 2;
nextUpto += inc;
}
// Now do normal binary search
//System.out.println(" after fwd: low=" + low + " high=" + high);
while (true) {
if (low > high) {
// Not exactly found
//System.out.println(" break: no match");
upto = low;
break;
}
int mid = (low + high) >>> 1;
int cmp = docIDs[mid] - target;
//System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
// Found target
upto = mid;
//System.out.println(" break: match");
break;
}
}
//System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? NO_MORE_DOCS : docIDs[upto]));
if (upto == docIDs.length) {
//System.out.println(" return END");
return docID = NO_MORE_DOCS;
} else {
//System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto);
return docID = docIDs[upto];
}
}
@Override
public long cost() {
return docIDs.length;
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
}
// TODO: specialize offsets and not
private final static class HighFreqPostingsEnum extends PostingsEnum {
private int[] docIDs;
private int[] freqs;
private int[][] positions;
private byte[][][] payloads;
private final boolean hasOffsets;
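// Stride within positions[doc]: each position is stored as
// position [, startOffset, endOffset]: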
private final int posJump;
private int upto;
private int docID = -1;
private int posUpto;
private int[] curPositions;
public HighFreqPostingsEnum(boolean hasOffsets) {
this.hasOffsets = hasOffsets;
posJump = hasOffsets ? 3 : 1;
}
public int[] getDocIDs() {
return docIDs;
}
public int[][] getPositions() {
return positions;
}
public int getPosJump() {
return posJump;
}
public PostingsEnum reset(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads) {
this.docIDs = docIDs;
this.freqs = freqs;
this.positions = positions;
this.payloads = payloads;
upto = -1;
return this;
}
@Override
public int nextDoc() {
upto++;
if (upto < docIDs.length) {
posUpto = -posJump;
curPositions = positions[upto];
return docID = docIDs[upto];
}
return docID = NO_MORE_DOCS;
}
@Override
public int freq() {
return freqs[upto];
}
@Override
public int docID() {
return docID;
}
@Override
public int nextPosition() {
posUpto += posJump;
assert posUpto < curPositions.length;
return curPositions[posUpto];
}
@Override
public int startOffset() {
if (hasOffsets) {
return curPositions[posUpto+1];
} else {
return -1;
}
}
@Override
public int endOffset() {
if (hasOffsets) {
return curPositions[posUpto+2];
} else {
return -1;
}
}
@Override
public int advance(int target) {
//System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + " of " + docIDs.length);
// if (DEBUG) {
// System.out.println("advance target=" + target + " len=" + docIDs.length);
// }
upto++;
if (upto == docIDs.length) {
return docID = NO_MORE_DOCS;
}
// First "grow" outwards, since most advances are to
// nearby docs:
int inc = 10;
int nextUpto = upto+10;
int low;
int high;
while (true) {
//System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc);
if (nextUpto >= docIDs.length) {
low = nextUpto-inc;
high = docIDs.length-1;
break;
}
//System.out.println(" docID=" + docIDs[nextUpto]);
if (target <= docIDs[nextUpto]) {
low = nextUpto-inc;
high = nextUpto;
break;
}
inc *= 2;
nextUpto += inc;
}
// Now do normal binary search
//System.out.println(" after fwd: low=" + low + " high=" + high);
while (true) {
if (low > high) {
// Not exactly found
//System.out.println(" break: no match");
upto = low;
break;
}
int mid = (low + high) >>> 1;
int cmp = docIDs[mid] - target;
//System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
// Found target
upto = mid;
//System.out.println(" break: match");
break;
}
}
//System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? NO_MORE_DOCS : docIDs[upto]));
if (upto == docIDs.length) {
//System.out.println(" return END");
return docID = NO_MORE_DOCS;
} else {
//System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto);
posUpto = -posJump;
curPositions = positions[upto];
return docID = docIDs[upto];
}
}
private final BytesRef payload = new BytesRef();
@Override
public BytesRef getPayload() {
if (payloads == null) {
return null;
} else {
final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)];
if (payloadBytes == null) {
return null;
}
payload.bytes = payloadBytes;
payload.length = payloadBytes.length;
payload.offset = 0;
return payload;
}
}
@Override
public long cost() {
return docIDs.length;
}
}
}