| Index: lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java (revision 0) |
| +++ lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java (working copy) |
| @@ -0,0 +1,2216 @@ |
| +package org.apache.lucene.codecs.memory; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.Comparator; |
| +import java.util.Iterator; |
| +import java.util.Map; |
| +import java.util.TreeMap; |
| + |
| +import org.apache.lucene.codecs.FieldsConsumer; |
| +import org.apache.lucene.codecs.FieldsProducer; |
| +import org.apache.lucene.codecs.PostingsFormat; |
| +import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; // javadocs |
| +import org.apache.lucene.index.DocsAndPositionsEnum; |
| +import org.apache.lucene.index.DocsEnum; |
| +import org.apache.lucene.index.FieldInfo.IndexOptions; |
| +import org.apache.lucene.index.FieldInfo; |
| +import org.apache.lucene.index.Fields; |
| +import org.apache.lucene.index.FieldsEnum; |
| +import org.apache.lucene.index.OrdTermState; |
| +import org.apache.lucene.index.SegmentReadState; |
| +import org.apache.lucene.index.SegmentWriteState; |
| +import org.apache.lucene.index.TermState; |
| +import org.apache.lucene.index.Terms; |
| +import org.apache.lucene.index.TermsEnum; |
| +import org.apache.lucene.store.IOContext; |
| +import org.apache.lucene.store.RAMOutputStream; |
| +import org.apache.lucene.util.ArrayUtil; |
| +import org.apache.lucene.util.Bits; |
| +import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.automaton.CompiledAutomaton; |
| +import org.apache.lucene.util.automaton.RunAutomaton; |
| +import org.apache.lucene.util.automaton.Transition; |
| + |
| +// TODO: |
| +// - build depth-N prefix hash? |
| +// - or: longer dense skip lists than just next byte? |
| + |
| +/** Wraps {@link Lucene40PostingsFormat} format for on-disk |
| + * storage, but then at read time loads and stores all |
| + * terms &amp; postings directly in RAM as byte[], int[]. |
| + * |
| + * <p><b><font color=red>WARNING</font></b>: This is |
| + * exceptionally RAM intensive: it makes no effort to |
| + * compress the postings data, storing terms as separate |
| + * byte[] and postings as separate int[], but as a result it |
| + * gives a substantial increase in search performance. |
| + * |
| + * <p>This postings format supports {@link TermsEnum#ord} |
| + * and {@link TermsEnum#seekExact(long)}. |
| + * |
| + * <p>Because this holds all term bytes as a single |
| + * byte[], you cannot have more than 2.1GB worth of term |
| + * bytes in a single segment. |
| + * |
| + * @lucene.experimental */ |
| + |
| +public class DirectPostingsFormat extends PostingsFormat { |
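| + |
| + // Usage sketch (illustrative only, not part of this patch): hook the |
| + // format up per field via the codec. This assumes the Lucene 4.0 |
| + // Lucene40Codec.getPostingsFormatForField extension point and |
| + // IndexWriterConfig.setCodec: |
| + // |
| + //   Codec codec = new Lucene40Codec() { |
| + //     @Override |
| + //     public PostingsFormat getPostingsFormatForField(String field) { |
| + //       return new DirectPostingsFormat(); |
| + //     } |
| + //   }; |
| + //   IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); |
| + //   iwc.setCodec(codec); |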
| + |
| + private final int minSkipCount; |
| + private final int lowFreqCutoff; |
| + |
| + private final static int DEFAULT_MIN_SKIP_COUNT = 8; |
| + private final static int DEFAULT_LOW_FREQ_CUTOFF = 32; |
| + |
| + //private static final boolean DEBUG = true; |
| + |
| + // TODO: allow passing/wrapping arbitrary postings format? |
| + |
| + public DirectPostingsFormat() { |
| + this(DEFAULT_MIN_SKIP_COUNT, DEFAULT_LOW_FREQ_CUTOFF); |
| + } |
| + |
| + /** minSkipCount is how many terms in a row must share a |
| + * prefix before we put a skip pointer down. Terms |
| + * with docFreq &lt;= lowFreqCutoff will use a single |
| + * int[] to hold all docs, freqs, positions and offsets; |
| + * terms with higher docFreq will use separate arrays. */ |
| + public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) { |
| + super("Direct"); |
| + this.minSkipCount = minSkipCount; |
| + this.lowFreqCutoff = lowFreqCutoff; |
| + } |
| + |
| + @Override |
| + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { |
| + return PostingsFormat.forName("Lucene40").fieldsConsumer(state); |
| + } |
| + |
| + @Override |
| + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { |
| + FieldsProducer postings = PostingsFormat.forName("Lucene40").fieldsProducer(state); |
| + if (state.context.context != IOContext.Context.MERGE) { |
| + FieldsProducer loadedPostings; |
| + try { |
| + loadedPostings = new DirectFields(state, postings, minSkipCount, lowFreqCutoff); |
| + } finally { |
| + postings.close(); |
| + } |
| + return loadedPostings; |
| + } else { |
| + // Don't load postings for merge: |
| + return postings; |
| + } |
| + } |
| + |
| + private static final class DirectFields extends FieldsProducer { |
| + private final Map<String,DirectField> fields = new TreeMap<String,DirectField>(); |
| + |
| + public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff) throws IOException { |
| + FieldsEnum fieldsEnum = fields.iterator(); |
| + String field; |
| + while ((field = fieldsEnum.next()) != null) { |
| + this.fields.put(field, new DirectField(state, field, fieldsEnum.terms(), minSkipCount, lowFreqCutoff)); |
| + } |
| + } |
| + |
| + @Override |
| + public FieldsEnum iterator() { |
| + |
| + final Iterator<Map.Entry<String,DirectField>> iter = fields.entrySet().iterator(); |
| + |
| + return new FieldsEnum() { |
| + Map.Entry<String,DirectField> current; |
| + |
| + @Override |
| + public String next() { |
| + if (iter.hasNext()) { |
| + current = iter.next(); |
| + return current.getKey(); |
| + } else { |
| + return null; |
| + } |
| + } |
| + |
| + @Override |
| + public Terms terms() { |
| + return current.getValue(); |
| + } |
| + }; |
| + } |
| + |
| + @Override |
| + public Terms terms(String field) { |
| + return fields.get(field); |
| + } |
| + |
| + @Override |
| + public int size() { |
| + return fields.size(); |
| + } |
| + |
| + @Override |
| + public long getUniqueTermCount() { |
| + long numTerms = 0; |
| + for(DirectField field : fields.values()) { |
| + numTerms += field.terms.length; |
| + } |
| + return numTerms; |
| + } |
| + |
| + @Override |
| + public void close() { |
| + } |
| + } |
| + |
| + private final static class DirectField extends Terms { |
| + |
| + private static abstract class TermAndSkip { |
| + public int[] skips; |
| + } |
| + |
| + private static final class LowFreqTerm extends TermAndSkip { |
| + public final int[] postings; |
| + public final byte[] payloads; |
| + public final int docFreq; |
| + public final int totalTermFreq; |
| + |
| + public LowFreqTerm(int[] postings, byte[] payloads, int docFreq, int totalTermFreq) { |
| + this.postings = postings; |
| + this.payloads = payloads; |
| + this.docFreq = docFreq; |
| + this.totalTermFreq = totalTermFreq; |
| + } |
| + } |
| + |
| + // TODO: maybe specialize into prx/no-prx/no-frq cases? |
| + private static final class HighFreqTerm extends TermAndSkip { |
| + public final long totalTermFreq; |
| + public final int[] docIDs; |
| + public final int[] freqs; |
| + public final int[][] positions; |
| + public final byte[][][] payloads; |
| + |
| + public HighFreqTerm(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads, long totalTermFreq) { |
| + this.docIDs = docIDs; |
| + this.freqs = freqs; |
| + this.positions = positions; |
| + this.payloads = payloads; |
| + this.totalTermFreq = totalTermFreq; |
| + } |
| + } |
| + |
| + private final byte[] termBytes; |
| + private final int[] termOffsets; |
| + |
| + private final int[] skips; |
| + private final int[] skipOffsets; |
| + |
| + private final TermAndSkip[] terms; |
| + private final boolean hasFreq; |
| + private final boolean hasPos; |
| + private final boolean hasOffsets; |
| + private final boolean hasPayloads; |
| + private final long sumTotalTermFreq; |
| + private final int docCount; |
| + private final long sumDocFreq; |
| + private int skipCount; |
| + |
| + // TODO: maybe make a separate builder? These are only |
| + // used during load: |
| + private int count; |
| + private int[] sameCounts = new int[10]; |
| + private final int minSkipCount; |
| + |
| + private final static class IntArrayWriter { |
| + private int[] ints = new int[10]; |
| + private int upto; |
| + |
| + public void add(int value) { |
| + if (ints.length == upto) { |
| + ints = ArrayUtil.grow(ints); |
| + } |
| + ints[upto++] = value; |
| + } |
| + |
| + public int[] get() { |
| + final int[] arr = new int[upto]; |
| + System.arraycopy(ints, 0, arr, 0, upto); |
| + upto = 0; |
| + return arr; |
| + } |
| + } |
| + |
| + public DirectField(SegmentReadState state, String field, Terms termsIn, int minSkipCount, int lowFreqCutoff) throws IOException { |
| + final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); |
| + |
| + sumTotalTermFreq = termsIn.getSumTotalTermFreq(); |
| + sumDocFreq = termsIn.getSumDocFreq(); |
| + docCount = termsIn.getDocCount(); |
| + |
| + final int numTerms = (int) termsIn.size(); |
| + if (numTerms == -1) { |
| + throw new IllegalArgumentException("codec does not provide Terms.size()"); |
| + } |
| + terms = new TermAndSkip[numTerms]; |
| + termOffsets = new int[1+numTerms]; |
| + |
| + byte[] termBytes = new byte[1024]; |
| + |
| + this.minSkipCount = minSkipCount; |
| + |
| + hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_ONLY) > 0; |
| + hasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) > 0; |
| + hasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0; |
| + hasPayloads = fieldInfo.hasPayloads(); |
| + |
| + BytesRef term; |
| + DocsEnum docsEnum = null; |
| + DocsAndPositionsEnum docsAndPositionsEnum = null; |
| + final TermsEnum termsEnum = termsIn.iterator(null); |
| + int termOffset = 0; |
| + |
| + final IntArrayWriter scratch = new IntArrayWriter(); |
| + |
| + // Used for payloads, if any: |
| + final RAMOutputStream ros = new RAMOutputStream(); |
| + |
| + // if (DEBUG) { |
| + // System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + " hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" + hasPayloads); |
| + // } |
| + |
| + while ((term = termsEnum.next()) != null) { |
| + final int docFreq = termsEnum.docFreq(); |
| + final long totalTermFreq = termsEnum.totalTermFreq(); |
| + |
| + // if (DEBUG) { |
| + // System.out.println(" term=" + term.utf8ToString()); |
| + // } |
| + |
| + termOffsets[count] = termOffset; |
| + |
| + if (termBytes.length < (termOffset + term.length)) { |
| + termBytes = ArrayUtil.grow(termBytes, termOffset + term.length); |
| + } |
| + System.arraycopy(term.bytes, term.offset, termBytes, termOffset, term.length); |
| + termOffset += term.length; |
| + termOffsets[count+1] = termOffset; |
| + |
| + if (hasPos) { |
| + docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum, hasOffsets); |
| + } else { |
| + docsEnum = termsEnum.docs(null, docsEnum, hasFreq); |
| + } |
| + |
| + final TermAndSkip ent; |
| + |
| + final DocsEnum docsEnum2; |
| + if (hasPos) { |
| + docsEnum2 = docsAndPositionsEnum; |
| + } else { |
| + docsEnum2 = docsEnum; |
| + } |
| + |
| + int docID; |
| + |
| + if (docFreq <= lowFreqCutoff) { |
| + |
| + ros.reset(); |
| + |
| + // Pack postings for low-freq terms into a single int[]: |
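| + // Layout per doc: docID (, freq (, per position: pos |
| + // (, startOffset, endOffset) (, payloadLength))); the |
| + // payload bytes themselves go to the parallel ros stream: |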
| + while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS) { |
| + scratch.add(docID); |
| + if (hasFreq) { |
| + final int freq = docsEnum2.freq(); |
| + scratch.add(freq); |
| + if (hasPos) { |
| + for(int pos=0;pos<freq;pos++) { |
| + scratch.add(docsAndPositionsEnum.nextPosition()); |
| + if (hasOffsets) { |
| + scratch.add(docsAndPositionsEnum.startOffset()); |
| + scratch.add(docsAndPositionsEnum.endOffset()); |
| + } |
| + if (hasPayloads) { |
| + final BytesRef payload; |
| + if (docsAndPositionsEnum.hasPayload()) { |
| + payload = docsAndPositionsEnum.getPayload(); |
| + scratch.add(payload.length); |
| + ros.writeBytes(payload.bytes, payload.offset, payload.length); |
| + } else { |
| + scratch.add(0); |
| + } |
| + } |
| + } |
| + } |
| + } |
| + } |
| + |
| + final byte[] payloads; |
| + if (hasPayloads) { |
| + ros.flush(); |
| + payloads = new byte[(int) ros.length()]; |
| + ros.writeTo(payloads, 0); |
| + } else { |
| + payloads = null; |
| + } |
| + |
| + final int[] postings = scratch.get(); |
| + |
| + ent = new LowFreqTerm(postings, payloads, docFreq, (int) totalTermFreq); |
| + } else { |
| + final int[] docs = new int[docFreq]; |
| + final int[] freqs; |
| + final int[][] positions; |
| + final byte[][][] payloads; |
| + if (hasFreq) { |
| + freqs = new int[docFreq]; |
| + if (hasPos) { |
| + positions = new int[docFreq][]; |
| + if (hasPayloads) { |
| + payloads = new byte[docFreq][][]; |
| + } else { |
| + payloads = null; |
| + } |
| + } else { |
| + positions = null; |
| + payloads = null; |
| + } |
| + } else { |
| + freqs = null; |
| + positions = null; |
| + payloads = null; |
| + } |
| + |
| + // Use separate int[] for the postings for high-freq |
| + // terms: |
| + int upto = 0; |
| + while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS) { |
| + docs[upto] = docID; |
| + if (hasFreq) { |
| + final int freq = docsEnum2.freq(); |
| + freqs[upto] = freq; |
| + if (hasPos) { |
| + final int mult; |
| + if (hasOffsets) { |
| + mult = 3; |
| + } else { |
| + mult = 1; |
| + } |
| + if (hasPayloads) { |
| + payloads[upto] = new byte[freq][]; |
| + } |
| + positions[upto] = new int[mult*freq]; |
| + int posUpto = 0; |
| + for(int pos=0;pos<freq;pos++) { |
| + positions[upto][posUpto] = docsAndPositionsEnum.nextPosition(); |
| + if (hasPayloads) { |
| + if (docsAndPositionsEnum.hasPayload()) { |
| + BytesRef payload = docsAndPositionsEnum.getPayload(); |
| + assert payload != null; |
| + byte[] payloadBytes = new byte[payload.length]; |
| + System.arraycopy(payload.bytes, payload.offset, payloadBytes, 0, payload.length); |
| + payloads[upto][pos] = payloadBytes; |
| + } |
| + } |
| + posUpto++; |
| + if (hasOffsets) { |
| + positions[upto][posUpto++] = docsAndPositionsEnum.startOffset(); |
| + positions[upto][posUpto++] = docsAndPositionsEnum.endOffset(); |
| + } |
| + } |
| + } |
| + } |
| + |
| + upto++; |
| + } |
| + assert upto == docFreq; |
| + ent = new HighFreqTerm(docs, freqs, positions, payloads, totalTermFreq); |
| + } |
| + |
| + terms[count] = ent; |
| + setSkips(count, termBytes); |
| + count++; |
| + } |
| + |
| + // End sentinel: |
| + termOffsets[count] = termOffset; |
| + |
| + finishSkips(); |
| + |
| + //System.out.println(skipCount + " skips: " + field); |
| + |
| + this.termBytes = new byte[termOffset]; |
| + System.arraycopy(termBytes, 0, this.termBytes, 0, termOffset); |
| + |
| + // Pack skips: |
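| + // CSR-style layout: term i's skip entries occupy |
| + // skips[skipOffsets[i] .. skipOffsets[i+1]): |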
| + this.skips = new int[skipCount]; |
| + this.skipOffsets = new int[1+numTerms]; |
| + |
| + int skipOffset = 0; |
| + for(int i=0;i<numTerms;i++) { |
| + final int[] termSkips = terms[i].skips; |
| + skipOffsets[i] = skipOffset; |
| + if (termSkips != null) { |
| + System.arraycopy(termSkips, 0, skips, skipOffset, termSkips.length); |
| + skipOffset += termSkips.length; |
| + terms[i].skips = null; |
| + } |
| + } |
| + this.skipOffsets[numTerms] = skipOffset; |
| + assert skipOffset == skipCount; |
| + } |
| + |
| + // Compares in unicode (UTF8) order: |
| + int compare(int ord, BytesRef other) { |
| + final byte[] otherBytes = other.bytes; |
| + |
| + int upto = termOffsets[ord]; |
| + final int termLen = termOffsets[1+ord] - upto; |
| + int otherUpto = other.offset; |
| + |
| + final int stop = upto + Math.min(termLen, other.length); |
| + while (upto < stop) { |
| + int diff = (termBytes[upto++] & 0xFF) - (otherBytes[otherUpto++] & 0xFF); |
| + if (diff != 0) { |
| + return diff; |
| + } |
| + } |
| + |
| + // One is a prefix of the other, or, they are equal: |
| + return termLen - other.length; |
| + } |
| + |
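| + // sameCounts[i] counts how many terms in a row have shared |
| + // the byte at position i; when a run ends with at least |
| + // minSkipCount terms, saveSkip attaches a skip pointer (to |
| + // the first term past the run) to the term that started it: |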
| + private void setSkips(int termOrd, byte[] termBytes) { |
| + |
| + final int termLength = termOffsets[termOrd+1] - termOffsets[termOrd]; |
| + |
| + if (sameCounts.length < termLength) { |
| + sameCounts = ArrayUtil.grow(sameCounts, termLength); |
| + } |
| + |
| + // Update skip pointers: |
| + if (termOrd > 0) { |
| + final int lastTermLength = termOffsets[termOrd] - termOffsets[termOrd-1]; |
| + final int limit = Math.min(termLength, lastTermLength); |
| + |
| + int lastTermOffset = termOffsets[termOrd-1]; |
| + int termOffset = termOffsets[termOrd]; |
| + |
| + int i = 0; |
| + for(;i<limit;i++) { |
| + if (termBytes[lastTermOffset++] == termBytes[termOffset++]) { |
| + sameCounts[i]++; |
| + } else { |
| + for(;i<limit;i++) { |
| + if (sameCounts[i] >= minSkipCount) { |
| + // Go back and add a skip pointer: |
| + saveSkip(termOrd, sameCounts[i]); |
| + } |
| + sameCounts[i] = 1; |
| + } |
| + break; |
| + } |
| + } |
| + |
| + for(;i<lastTermLength;i++) { |
| + if (sameCounts[i] >= minSkipCount) { |
| + // Go back and add a skip pointer: |
| + saveSkip(termOrd, sameCounts[i]); |
| + } |
| + sameCounts[i] = 0; |
| + } |
| + for(int j=limit;j<termLength;j++) { |
| + sameCounts[j] = 1; |
| + } |
| + } else { |
| + for(int i=0;i<termLength;i++) { |
| + sameCounts[i]++; |
| + } |
| + } |
| + } |
| + |
| + private void finishSkips() { |
| + assert count == terms.length; |
| + int lastTermOffset = termOffsets[count-1]; |
| + int lastTermLength = termOffsets[count] - lastTermOffset; |
| + |
| + for(int i=0;i<lastTermLength;i++) { |
| + if (sameCounts[i] >= minSkipCount) { |
| + // Go back and add a skip pointer: |
| + saveSkip(count, sameCounts[i]); |
| + } |
| + } |
| + |
| + // Reverse the skip pointers so they are "nested": |
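| + // (saveSkip appended them in increasing changeOrd order; |
| + // the intersect enum wants the outermost, largest jump |
| + // first): |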
| + for(int termID=0;termID<terms.length;termID++) { |
| + TermAndSkip term = terms[termID]; |
| + if (term.skips != null && term.skips.length > 1) { |
| + for(int pos=0;pos<term.skips.length/2;pos++) { |
| + final int otherPos = term.skips.length-pos-1; |
| + |
| + final int temp = term.skips[pos]; |
| + term.skips[pos] = term.skips[otherPos]; |
| + term.skips[otherPos] = temp; |
| + } |
| + } |
| + } |
| + } |
| + |
| + private void saveSkip(int ord, int backCount) { |
| + final TermAndSkip term = terms[ord - backCount]; |
| + skipCount++; |
| + if (term.skips == null) { |
| + term.skips = new int[] {ord}; |
| + } else { |
| + // Normally we'd grow at a slight exponential... but |
| + // given that the skips themselves are already log(N) |
| + // we can grow by only 1 and still have amortized |
| + // linear time: |
| + final int[] newSkips = new int[term.skips.length+1]; |
| + System.arraycopy(term.skips, 0, newSkips, 0, term.skips.length); |
| + term.skips = newSkips; |
| + term.skips[term.skips.length-1] = ord; |
| + } |
| + } |
| + |
| + @Override |
| + public TermsEnum iterator(TermsEnum reuse) { |
| + DirectTermsEnum termsEnum; |
| + if (reuse instanceof DirectTermsEnum) { |
| + termsEnum = (DirectTermsEnum) reuse; |
| + if (!termsEnum.canReuse(terms)) { |
| + termsEnum = new DirectTermsEnum(); |
| + } |
| + } else { |
| + termsEnum = new DirectTermsEnum(); |
| + } |
| + termsEnum.reset(); |
| + return termsEnum; |
| + } |
| + |
| + @Override |
| + public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) { |
| + return new DirectIntersectTermsEnum(compiled, startTerm); |
| + } |
| + |
| + @Override |
| + public long size() { |
| + return terms.length; |
| + } |
| + |
| + @Override |
| + public long getSumTotalTermFreq() { |
| + return sumTotalTermFreq; |
| + } |
| + |
| + @Override |
| + public long getSumDocFreq() { |
| + return sumDocFreq; |
| + } |
| + |
| + @Override |
| + public int getDocCount() { |
| + return docCount; |
| + } |
| + |
| + @Override |
| + public Comparator<BytesRef> getComparator() { |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| + } |
| + |
| + private final class DirectTermsEnum extends TermsEnum { |
| + |
| + private final BytesRef scratch = new BytesRef(); |
| + private int termOrd; |
| + |
| + boolean canReuse(TermAndSkip[] other) { |
| + return DirectField.this.terms == other; |
| + } |
| + |
| + private BytesRef setTerm() { |
| + scratch.bytes = termBytes; |
| + scratch.offset = termOffsets[termOrd]; |
| + scratch.length = termOffsets[termOrd+1] - termOffsets[termOrd]; |
| + return scratch; |
| + } |
| + |
| + public void reset() { |
| + termOrd = -1; |
| + } |
| + |
| + public Comparator<BytesRef> getComparator() { |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| + } |
| + |
| + @Override |
| + public BytesRef next() { |
| + termOrd++; |
| + if (termOrd < terms.length) { |
| + return setTerm(); |
| + } else { |
| + return null; |
| + } |
| + } |
| + |
| + @Override |
| + public TermState termState() { |
| + OrdTermState state = new OrdTermState(); |
| + state.ord = termOrd; |
| + return state; |
| + } |
| + |
| + // If non-negative, exact match; else, -ord-1, where ord |
| + // is where you would insert the term. |
| + private int findTerm(BytesRef term) { |
| + |
| + // Just do binary search: should be (constant factor) |
| + // faster than using the skip list: |
| + int low = 0; |
| + int high = terms.length-1; |
| + |
| + while (low <= high) { |
| + int mid = (low + high) >>> 1; |
| + int cmp = compare(mid, term); |
| + if (cmp < 0) { |
| + low = mid + 1; |
| + } else if (cmp > 0) { |
| + high = mid - 1; |
| + } else { |
| + return mid; // key found |
| + } |
| + } |
| + |
| + return -(low + 1); // key not found. |
| + } |
| + |
| + @Override |
| + public SeekStatus seekCeil(BytesRef term, boolean useCache) { |
| + // TODO: we should use the skip pointers; should be |
| + // faster than bin search; we should also hold |
| + // & reuse current state so seeking forwards is |
| + // faster |
| + final int ord = findTerm(term); |
| + // if (DEBUG) { |
| + // System.out.println(" find term=" + term.utf8ToString() + " ord=" + ord); |
| + // } |
| + if (ord >= 0) { |
| + termOrd = ord; |
| + setTerm(); |
| + return SeekStatus.FOUND; |
| + } else if (ord == -terms.length-1) { |
| + return SeekStatus.END; |
| + } else { |
| + termOrd = -ord - 1; |
| + setTerm(); |
| + return SeekStatus.NOT_FOUND; |
| + } |
| + } |
| + |
| + @Override |
| + public boolean seekExact(BytesRef term, boolean useCache) { |
| + // TODO: we should use the skip pointers; should be |
| + // faster than bin search; we should also hold |
| + // & reuse current state so seeking forwards is |
| + // faster |
| + final int ord = findTerm(term); |
| + if (ord >= 0) { |
| + termOrd = ord; |
| + setTerm(); |
| + return true; |
| + } else { |
| + return false; |
| + } |
| + } |
| + |
| + @Override |
| + public void seekExact(long ord) { |
| + termOrd = (int) ord; |
| + setTerm(); |
| + } |
| + |
| + @Override |
| + public void seekExact(BytesRef term, TermState state) throws IOException { |
| + termOrd = (int) ((OrdTermState) state).ord; |
| + setTerm(); |
| + assert term.equals(scratch); |
| + } |
| + |
| + @Override |
| + public BytesRef term() { |
| + return scratch; |
| + } |
| + |
| + @Override |
| + public long ord() { |
| + return termOrd; |
| + } |
| + |
| + @Override |
| + public int docFreq() { |
| + if (terms[termOrd] instanceof LowFreqTerm) { |
| + return ((LowFreqTerm) terms[termOrd]).docFreq; |
| + } else { |
| + return ((HighFreqTerm) terms[termOrd]).docIDs.length; |
| + } |
| + } |
| + |
| + @Override |
| + public long totalTermFreq() { |
| + if (terms[termOrd] instanceof LowFreqTerm) { |
| + return ((LowFreqTerm) terms[termOrd]).totalTermFreq; |
| + } else { |
| + return ((HighFreqTerm) terms[termOrd]).totalTermFreq; |
| + } |
| + } |
| + |
| + @Override |
| + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs) { |
| + if (needsFreqs && !hasFreq) { |
| + return null; |
| + } |
| + |
| + // TODO: implement reuse, something like Pulsing: |
| + // it's hairy! |
| + |
| + if (terms[termOrd] instanceof LowFreqTerm) { |
| + final int[] postings = ((LowFreqTerm) terms[termOrd]).postings; |
| + if (hasFreq) { |
| + if (hasPos) { |
| + int posLen; |
| + if (hasOffsets) { |
| + posLen = 3; |
| + } else { |
| + posLen = 1; |
| + } |
| + if (hasPayloads) { |
| + posLen++; |
| + } |
| + LowFreqDocsEnum docsEnum; |
| + if (reuse instanceof LowFreqDocsEnum) { |
| + docsEnum = (LowFreqDocsEnum) reuse; |
| + if (!docsEnum.canReuse(liveDocs, posLen)) { |
| + docsEnum = new LowFreqDocsEnum(liveDocs, posLen); |
| + } |
| + } else { |
| + docsEnum = new LowFreqDocsEnum(liveDocs, posLen); |
| + } |
| + |
| + return docsEnum.reset(postings); |
| + } else { |
| + LowFreqDocsEnumNoPos docsEnum; |
| + if (reuse instanceof LowFreqDocsEnumNoPos) { |
| + docsEnum = (LowFreqDocsEnumNoPos) reuse; |
| + if (!docsEnum.canReuse(liveDocs)) { |
| + docsEnum = new LowFreqDocsEnumNoPos(liveDocs); |
| + } |
| + } else { |
| + docsEnum = new LowFreqDocsEnumNoPos(liveDocs); |
| + } |
| + |
| + return docsEnum.reset(postings); |
| + } |
| + } else { |
| + LowFreqDocsEnumNoTF docsEnum; |
| + if (reuse instanceof LowFreqDocsEnumNoTF) { |
| + docsEnum = (LowFreqDocsEnumNoTF) reuse; |
| + if (!docsEnum.canReuse(liveDocs)) { |
| + docsEnum = new LowFreqDocsEnumNoTF(liveDocs); |
| + } |
| + } else { |
| + docsEnum = new LowFreqDocsEnumNoTF(liveDocs); |
| + } |
| + |
| + return docsEnum.reset(postings); |
| + } |
| + } else { |
| + final HighFreqTerm term = (HighFreqTerm) terms[termOrd]; |
| + |
| + HighFreqDocsEnum docsEnum; |
| + if (reuse instanceof HighFreqDocsEnum) { |
| + docsEnum = (HighFreqDocsEnum) reuse; |
| + if (!docsEnum.canReuse(liveDocs)) { |
| + docsEnum = new HighFreqDocsEnum(liveDocs); |
| + } |
| + } else { |
| + docsEnum = new HighFreqDocsEnum(liveDocs); |
| + } |
| + |
| + //System.out.println(" DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() + ": " + term.docIDs.length + " docs"); |
| + return docsEnum.reset(term.docIDs, term.freqs); |
| + } |
| + } |
| + |
| + @Override |
| + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) { |
| + if (!hasPos) { |
| + return null; |
| + } |
| + if (needsOffsets && !hasOffsets) { |
| + return null; |
| + } |
| + |
| + // TODO: implement reuse, something like Pulsing: |
| + // it's hairy! |
| + |
| + if (terms[termOrd] instanceof LowFreqTerm) { |
| + final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]); |
| + final int[] postings = term.postings; |
| + final byte[] payloads = term.payloads; |
| + return new LowFreqDocsAndPositionsEnum(liveDocs, hasOffsets, hasPayloads).reset(postings, payloads); |
| + } else { |
| + final HighFreqTerm term = (HighFreqTerm) terms[termOrd]; |
| + return new HighFreqDocsAndPositionsEnum(liveDocs, hasOffsets).reset(term.docIDs, term.freqs, term.positions, term.payloads); |
| + } |
| + } |
| + } |
| + |
| + private final class DirectIntersectTermsEnum extends TermsEnum { |
| + private final RunAutomaton runAutomaton; |
| + private final CompiledAutomaton compiledAutomaton; |
| + private int termOrd; |
| + private final BytesRef scratch = new BytesRef(); |
| + |
| + private final class State { |
| + int changeOrd; |
| + int state; |
| + Transition[] transitions; |
| + int transitionUpto; |
| + int transitionMax; |
| + int transitionMin; |
| + } |
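| + |
| + // states[0..stateUpto] track the automaton along the |
| + // current term's prefix; changeOrd (taken from the skip |
| + // list) is the first ord where that prefix byte changes, |
| + // so hitting it pops the state: |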
| + |
| + private State[] states; |
| + private int stateUpto; |
| + |
| + public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) { |
| + runAutomaton = compiled.runAutomaton; |
| + compiledAutomaton = compiled; |
| + termOrd = -1; |
| + states = new State[1]; |
| + states[0] = new State(); |
| + states[0].changeOrd = terms.length; |
| + states[0].state = runAutomaton.getInitialState(); |
| + states[0].transitions = compiledAutomaton.sortedTransitions[states[0].state]; |
| + states[0].transitionUpto = -1; |
| + states[0].transitionMax = -1; |
| + |
| + //System.out.println("IE.init startTerm=" + startTerm); |
| + |
| + if (startTerm != null) { |
| + int skipUpto = 0; |
| + if (startTerm.length == 0) { |
| + if (terms.length > 0 && termOffsets[1] == 0) { |
| + termOrd = 0; |
| + } |
| + } else { |
| + termOrd++; |
| + |
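| + // Seek to startTerm: match it byte by byte, using skip |
| + // pointers to jump over terms that cannot match and |
| + // pushing one automaton state per matched byte: |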
| + nextLabel: |
| + for(int i=0;i<startTerm.length;i++) { |
| + final int label = startTerm.bytes[startTerm.offset+i] & 0xFF; |
| + |
| + while (label > states[i].transitionMax) { |
| + states[i].transitionUpto++; |
| + assert states[i].transitionUpto < states[i].transitions.length; |
| + states[i].transitionMin = states[i].transitions[states[i].transitionUpto].getMin(); |
| + states[i].transitionMax = states[i].transitions[states[i].transitionUpto].getMax(); |
| + assert states[i].transitionMin >= 0; |
| + assert states[i].transitionMin <= 255; |
| + assert states[i].transitionMax >= 0; |
| + assert states[i].transitionMax <= 255; |
| + } |
| + |
| + // Skip forwards until we find a term matching |
| + // the label at this position: |
| + while (termOrd < terms.length) { |
| + final int skipOffset = skipOffsets[termOrd]; |
| + final int numSkips = skipOffsets[termOrd+1] - skipOffset; |
| + final int termOffset = termOffsets[termOrd]; |
| + final int termLength = termOffsets[1+termOrd] - termOffset; |
| + |
| + // if (DEBUG) { |
| + // System.out.println(" check termOrd=" + termOrd + " term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips) + " i=" + i); |
| + // } |
| + |
| + if (termOrd == states[stateUpto].changeOrd) { |
| + // if (DEBUG) { |
| + // System.out.println(" end push return"); |
| + // } |
| + stateUpto--; |
| + termOrd--; |
| + return; |
| + } |
| + |
| + if (termLength == i) { |
| + termOrd++; |
| + skipUpto = 0; |
| + // if (DEBUG) { |
| + // System.out.println(" term too short; next term"); |
| + // } |
| + } else if (label < (termBytes[termOffset+i] & 0xFF)) { |
| + termOrd--; |
| + // if (DEBUG) { |
| + // System.out.println(" no match; already beyond; return termOrd=" + termOrd); |
| + // } |
| + stateUpto -= skipUpto; |
| + assert stateUpto >= 0; |
| + return; |
| + } else if (label == (termBytes[termOffset+i] & 0xFF)) { |
| + // if (DEBUG) { |
| + // System.out.println(" label[" + i + "] matches"); |
| + // } |
| + if (skipUpto < numSkips) { |
| + grow(); |
| + |
| + final int nextState = runAutomaton.step(states[stateUpto].state, label); |
| + |
| + // Automaton is required to accept startTerm: |
| + assert nextState != -1; |
| + |
| + stateUpto++; |
| + states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; |
| + states[stateUpto].state = nextState; |
| + states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; |
| + states[stateUpto].transitionUpto = -1; |
| + states[stateUpto].transitionMax = -1; |
| + //System.out.println(" push " + states[stateUpto].transitions.length + " trans"); |
| + |
| + // if (DEBUG) { |
| + // System.out.println(" push skip; changeOrd=" + states[stateUpto].changeOrd); |
| + // } |
| + |
| + // Match next label at this same term: |
| + continue nextLabel; |
| + } else { |
| + // if (DEBUG) { |
| + // System.out.println(" linear scan"); |
| + // } |
| + // Index exhausted: just scan now (the |
| + // number of scans required will be less |
| + // than the minSkipCount): |
| + final int startTermOrd = termOrd; |
| + while (termOrd < terms.length && compare(termOrd, startTerm) <= 0) { |
| + assert termOrd == startTermOrd || skipOffsets[termOrd] == skipOffsets[termOrd+1]; |
| + termOrd++; |
| + } |
| + assert termOrd - startTermOrd < minSkipCount; |
| + termOrd--; |
| + stateUpto -= skipUpto; |
| + // if (DEBUG) { |
| + // System.out.println(" end termOrd=" + termOrd); |
| + // } |
| + return; |
| + } |
| + } else { |
| + if (skipUpto < numSkips) { |
| + termOrd = skips[skipOffset + skipUpto]; |
| + // if (DEBUG) { |
| + // System.out.println(" no match; skip to termOrd=" + termOrd); |
| + // } |
| + } else { |
| + // if (DEBUG) { |
| + // System.out.println(" no match; next term"); |
| + // } |
| + termOrd++; |
| + } |
| + skipUpto = 0; |
| + } |
| + } |
| + |
| + // startTerm is >= last term so enum will not |
| + // return any terms: |
| + termOrd--; |
| + // if (DEBUG) { |
| + // System.out.println(" beyond end; no terms will match"); |
| + // } |
| + return; |
| + } |
| + } |
| + |
| + final int termOffset = termOffsets[termOrd]; |
| + final int termLen = termOffsets[1+termOrd] - termOffset; |
| + |
| + if (termOrd >= 0 && !startTerm.equals(new BytesRef(termBytes, termOffset, termLen))) { |
| + stateUpto -= skipUpto; |
| + termOrd--; |
| + } |
| + // if (DEBUG) { |
| + // System.out.println(" loop end; return termOrd=" + termOrd + " stateUpto=" + stateUpto); |
| + // } |
| + } |
| + } |
| + |
| + public Comparator<BytesRef> getComparator() { |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| + } |
| + |
| + private void grow() { |
| + if (states.length == 1+stateUpto) { |
| + final State[] newStates = new State[states.length+1]; |
| + System.arraycopy(states, 0, newStates, 0, states.length); |
| + newStates[states.length] = new State(); |
| + states = newStates; |
| + } |
| + } |
| + |
| + @Override |
| + public BytesRef next() { |
| + // if (DEBUG) { |
| + // System.out.println("\nIE.next"); |
| + // } |
| + |
| + termOrd++; |
| + int skipUpto = 0; |
| + |
| + if (termOrd == 0 && termOffsets[1] == 0) { |
| + // Special-case empty string: |
| + assert stateUpto == 0; |
| + // if (DEBUG) { |
| + // System.out.println(" visit empty string"); |
| + // } |
| + if (runAutomaton.isAccept(states[0].state)) { |
| + scratch.bytes = termBytes; |
| + scratch.offset = 0; |
| + scratch.length = 0; |
| + return scratch; |
| + } |
| + termOrd++; |
| + } |
| + |
| + nextTerm: |
| + |
| + while (true) { |
| + // if (DEBUG) { |
| + // System.out.println(" cycle termOrd=" + termOrd + " stateUpto=" + stateUpto + " skipUpto=" + skipUpto); |
| + // } |
| + if (termOrd == terms.length) { |
| + // if (DEBUG) { |
| + // System.out.println(" return END"); |
| + // } |
| + return null; |
| + } |
| + |
| + final State state = states[stateUpto]; |
| + if (termOrd == state.changeOrd) { |
| + // Pop: |
| + // if (DEBUG) { |
| + // System.out.println(" pop stateUpto=" + stateUpto); |
| + // } |
| + stateUpto--; |
| + /* |
| + if (DEBUG) { |
| + try { |
| + //System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)).utf8ToString()); |
| + System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length))); |
| + } catch (ArrayIndexOutOfBoundsException aioobe) { |
| + System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length))); |
| + } |
| + } |
| + */ |
| + |
| + continue; |
| + } |
| + |
| + final int termOffset = termOffsets[termOrd]; |
| + final int termLength = termOffsets[termOrd+1] - termOffset; |
| + final int skipOffset = skipOffsets[termOrd]; |
| + final int numSkips = skipOffsets[termOrd+1] - skipOffset; |
| + |
| + // if (DEBUG) { |
| + // System.out.println(" term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips)); |
| + // } |
| + |
| + assert termOrd < state.changeOrd; |
| + |
| + assert stateUpto <= termLength: "term.length=" + termLength + "; stateUpto=" + stateUpto; |
| + final int label = termBytes[termOffset+stateUpto] & 0xFF; |
| + |
| + while (label > state.transitionMax) { |
| + //System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length); |
| + state.transitionUpto++; |
| + if (state.transitionUpto == state.transitions.length) { |
| + // We've exhausted transitions leaving this |
| + // state; force pop+next/skip now: |
| + //System.out.println("forcepop: stateUpto=" + stateUpto); |
| + if (stateUpto == 0) { |
| + termOrd = terms.length; |
| + return null; |
| + } else { |
| + assert state.changeOrd > termOrd; |
| + // if (DEBUG) { |
| + // System.out.println(" jumpend " + (state.changeOrd - termOrd)); |
| + // } |
| + //System.out.println(" jump to termOrd=" + states[stateUpto].changeOrd + " vs " + termOrd); |
| + termOrd = states[stateUpto].changeOrd; |
| + skipUpto = 0; |
| + stateUpto--; |
| + } |
| + continue nextTerm; |
| + } |
| + assert state.transitionUpto < state.transitions.length: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.length; |
| + state.transitionMin = state.transitions[state.transitionUpto].getMin(); |
| + state.transitionMax = state.transitions[state.transitionUpto].getMax(); |
| + assert state.transitionMin >= 0; |
| + assert state.transitionMin <= 255; |
| + assert state.transitionMax >= 0; |
| + assert state.transitionMax <= 255; |
| + } |
| + |
| + /* |
| + if (DEBUG) { |
| + System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + (char) label + "(" + label + ") term=" + new BytesRef(terms[termOrd].term).utf8ToString() + " trans " + |
| + (char) state.transitionMin + "(" + state.transitionMin + ")" + "-" + (char) state.transitionMax + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips))); |
| + System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + Integer.toHexString(label) + "(" + label + ") term=" + new BytesRef(termBytes, termOffset, termLength) + " trans " + |
| + Integer.toHexString(state.transitionMin) + "(" + state.transitionMin + ")" + "-" + Integer.toHexString(state.transitionMax) + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips))); |
| + } |
| + */ |
| + |
| + final int targetLabel = state.transitionMin; |
| + |
| + if ((termBytes[termOffset+stateUpto] & 0xFF) < targetLabel) { |
| + // if (DEBUG) { |
| + // System.out.println(" do bin search"); |
| + // } |
| + //int startTermOrd = termOrd; |
| + int low = termOrd+1; |
| + int high = state.changeOrd-1; |
| + while (true) { |
| + if (low > high) { |
| + // Label not found |
| + termOrd = low; |
| + // if (DEBUG) { |
| + // System.out.println(" advanced by " + (termOrd - startTermOrd)); |
| + // } |
| + //System.out.println(" jump " + (termOrd - startTermOrd)); |
| + skipUpto = 0; |
| + continue nextTerm; |
| + } |
| + int mid = (low + high) >>> 1; |
| + int cmp = (termBytes[termOffsets[mid] + stateUpto] & 0xFF) - targetLabel; |
| + // if (DEBUG) { |
| + // System.out.println(" bin: check label=" + (char) (termBytes[termOffsets[low] + stateUpto] & 0xFF) + " ord=" + mid); |
| + // } |
| + if (cmp < 0) { |
| + low = mid+1; |
| + } else if (cmp > 0) { |
| + high = mid - 1; |
| + } else { |
| + // Label found; walk backwards to first |
| + // occurrence: |
| + while (mid > termOrd && (termBytes[termOffsets[mid-1] + stateUpto] & 0xFF) == targetLabel) { |
| + mid--; |
| + } |
| + termOrd = mid; |
| + // if (DEBUG) { |
| + // System.out.println(" advanced by " + (termOrd - startTermOrd)); |
| + // } |
| + //System.out.println(" jump " + (termOrd - startTermOrd)); |
| + skipUpto = 0; |
| + continue nextTerm; |
| + } |
| + } |
| + } |
| + |
| + int nextState = runAutomaton.step(states[stateUpto].state, label); |
| + |
| + if (nextState == -1) { |
| + // Skip |
| + // if (DEBUG) { |
| + // System.out.println(" automaton doesn't accept; skip"); |
| + // } |
| + if (skipUpto < numSkips) { |
| + // if (DEBUG) { |
| + // System.out.println(" jump " + (skips[skipOffset+skipUpto]-1 - termOrd)); |
| + // } |
| + termOrd = skips[skipOffset+skipUpto]; |
| + } else { |
| + termOrd++; |
| + } |
| + skipUpto = 0; |
| + } else if (skipUpto < numSkips) { |
| + // Push: |
| + // if (DEBUG) { |
| + // System.out.println(" push"); |
| + // } |
| + /* |
| + if (DEBUG) { |
| + try { |
| + //System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1).utf8ToString()); |
| + System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1)); |
| + } catch (ArrayIndexOutOfBoundsException aioobe) { |
| + System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1)); |
| + } |
| + } |
| + */ |
| + |
| + grow(); |
| + stateUpto++; |
| + states[stateUpto].state = nextState; |
| + states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; |
| + states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; |
| + states[stateUpto].transitionUpto = -1; |
| + states[stateUpto].transitionMax = -1; |
| + |
| + if (stateUpto == termLength) { |
| + // if (DEBUG) { |
| + // System.out.println(" term ends after push"); |
| + // } |
| + if (runAutomaton.isAccept(nextState)) { |
| + // if (DEBUG) { |
| + // System.out.println(" automaton accepts: return"); |
| + // } |
| + scratch.bytes = termBytes; |
| + scratch.offset = termOffsets[termOrd]; |
| + scratch.length = termOffsets[1+termOrd] - scratch.offset; |
| + // if (DEBUG) { |
| + // System.out.println(" ret " + scratch.utf8ToString()); |
| + // } |
| + return scratch; |
| + } else { |
| + // if (DEBUG) { |
| + // System.out.println(" automaton rejects: nextTerm"); |
| + // } |
| + termOrd++; |
| + skipUpto = 0; |
| + } |
| + } |
| + } else { |
| + // Run the non-indexed tail of this term: |
| + |
| + // TODO: add assert that we don't inc too many times |
| + |
| + if (compiledAutomaton.commonSuffixRef != null) { |
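| + // Cheap pre-filter: any accepted term must end with |
| + // the automaton's common suffix: |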
| + //System.out.println("suffix " + compiledAutomaton.commonSuffixRef.utf8ToString()); |
| + assert compiledAutomaton.commonSuffixRef.offset == 0; |
| + if (termLength < compiledAutomaton.commonSuffixRef.length) { |
| + termOrd++; |
| + skipUpto = 0; |
| + continue nextTerm; |
| + } |
| + int offset = termOffset + termLength - compiledAutomaton.commonSuffixRef.length; |
| + for(int suffix=0;suffix<compiledAutomaton.commonSuffixRef.length;suffix++) { |
| + if (termBytes[offset + suffix] != compiledAutomaton.commonSuffixRef.bytes[suffix]) { |
| + termOrd++; |
| + skipUpto = 0; |
| + continue nextTerm; |
| + } |
| + } |
| + } |
| + |
| + int upto = stateUpto+1; |
| + while (upto < termLength) { |
| + nextState = runAutomaton.step(nextState, termBytes[termOffset+upto] & 0xFF); |
| + if (nextState == -1) { |
| + termOrd++; |
| + skipUpto = 0; |
| + // if (DEBUG) { |
| + // System.out.println(" nomatch tail; next term"); |
| + // } |
| + continue nextTerm; |
| + } |
| + upto++; |
| + } |
| + |
| + if (runAutomaton.isAccept(nextState)) { |
| + scratch.bytes = termBytes; |
| + scratch.offset = termOffsets[termOrd]; |
| + scratch.length = termOffsets[1+termOrd] - scratch.offset; |
| + // if (DEBUG) { |
| + // System.out.println(" match tail; return " + scratch.utf8ToString()); |
| + // System.out.println(" ret2 " + scratch.utf8ToString()); |
| + // } |
| + return scratch; |
| + } else { |
| + termOrd++; |
| + skipUpto = 0; |
| + // if (DEBUG) { |
| + // System.out.println(" nomatch tail; next term"); |
| + // } |
| + } |
| + } |
| + } |
| + } |
| + |
| + @Override |
| + public TermState termState() { |
| + OrdTermState state = new OrdTermState(); |
| + state.ord = termOrd; |
| + return state; |
| + } |
| + |
| + @Override |
| + public BytesRef term() { |
| + return scratch; |
| + } |
| + |
| + @Override |
| + public long ord() { |
| + return termOrd; |
| + } |
| + |
| + @Override |
| + public int docFreq() { |
| + if (terms[termOrd] instanceof LowFreqTerm) { |
| + return ((LowFreqTerm) terms[termOrd]).docFreq; |
| + } else { |
| + return ((HighFreqTerm) terms[termOrd]).docIDs.length; |
| + } |
| + } |
| + |
| + @Override |
| + public long totalTermFreq() { |
| + if (terms[termOrd] instanceof LowFreqTerm) { |
| + return ((LowFreqTerm) terms[termOrd]).totalTermFreq; |
| + } else { |
| + return ((HighFreqTerm) terms[termOrd]).totalTermFreq; |
| + } |
| + } |
| + |
| + @Override |
| + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs) { |
| + if (needsFreqs && !hasFreq) { |
| + return null; |
| + } |
| + |
| + // TODO: implement reuse, something like Pulsing: |
| + // it's hairy! |
| + |
| + if (terms[termOrd] instanceof LowFreqTerm) { |
| + final int[] postings = ((LowFreqTerm) terms[termOrd]).postings; |
| + if (hasFreq) { |
| + if (hasPos) { |
| + int posLen; |
| + if (hasOffsets) { |
| + posLen = 3; |
| + } else { |
| + posLen = 1; |
| + } |
| + if (hasPayloads) { |
| + posLen++; |
| + } |
| + return new LowFreqDocsEnum(liveDocs, posLen).reset(postings); |
| + } else { |
| + return new LowFreqDocsEnumNoPos(liveDocs).reset(postings); |
| + } |
| + } else { |
| + return new LowFreqDocsEnumNoTF(liveDocs).reset(postings); |
| + } |
| + } else { |
| + final HighFreqTerm term = (HighFreqTerm) terms[termOrd]; |
| + // System.out.println("DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() + ": " + term.docIDs.length + " docs"); |
| + return new HighFreqDocsEnum(liveDocs).reset(term.docIDs, term.freqs); |
| + } |
| + } |
| + |
| + @Override |
| + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) { |
| + if (!hasPos) { |
| + return null; |
| + } |
| + if (needsOffsets && !hasOffsets) { |
| + return null; |
| + } |
| + |
| + // TODO: implement reuse, something like Pulsing: |
| + // it's hairy! |
| + |
| + if (terms[termOrd] instanceof LowFreqTerm) { |
| + final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]); |
| + final int[] postings = term.postings; |
| + final byte[] payloads = term.payloads; |
| + return new LowFreqDocsAndPositionsEnum(liveDocs, hasOffsets, hasPayloads).reset(postings, payloads); |
| + } else { |
| + final HighFreqTerm term = (HighFreqTerm) terms[termOrd]; |
| + return new HighFreqDocsAndPositionsEnum(liveDocs, hasOffsets).reset(term.docIDs, term.freqs, term.positions, term.payloads); |
| + } |
| + } |
| + |
| + @Override |
| + public SeekStatus seekCeil(BytesRef term, boolean useCache) { |
| + throw new UnsupportedOperationException(); |
| + } |
| + |
| + @Override |
| + public void seekExact(long ord) { |
| + throw new UnsupportedOperationException(); |
| + } |
| + } |
| + } |
| + |
| + // Docs only: |
| + private final static class LowFreqDocsEnumNoTF extends DocsEnum { |
| + private int[] postings; |
| + private final Bits liveDocs; |
| + private int upto; |
| + |
| + public LowFreqDocsEnumNoTF(Bits liveDocs) { |
| + this.liveDocs = liveDocs; |
| + } |
| + |
| + public boolean canReuse(Bits liveDocs) { |
| + return liveDocs == this.liveDocs; |
| + } |
| + |
| + public DocsEnum reset(int[] postings) { |
| + this.postings = postings; |
| + upto = -1; |
| + return this; |
| + } |
| + |
| + // TODO: can do this w/o setting members? |
| + |
| + @Override |
| + public int nextDoc() { |
| + upto++; |
| + if (liveDocs == null) { |
| + if (upto < postings.length) { |
| + return postings[upto]; |
| + } |
| + } else { |
| + while (upto < postings.length) { |
| + if (liveDocs.get(postings[upto])) { |
| + return postings[upto]; |
| + } |
| + upto++; |
| + } |
| + } |
| + return NO_MORE_DOCS; |
| + } |
| + |
| + @Override |
| + public int docID() { |
| + if (upto < 0) { |
| + return -1; |
| + } else if (upto < postings.length) { |
| + return postings[upto]; |
| + } else { |
| + return NO_MORE_DOCS; |
| + } |
| + } |
| + |
| + @Override |
| + public int freq() { |
| + assert false; |
| + return 1; |
| + } |
| + |
| + @Override |
| + public int advance(int target) { |
| + // Linear scan, but this is low-freq term so it won't |
| + // be costly: |
| + while(nextDoc() < target) { |
| + } |
| + return docID(); |
| + } |
| + } |
| + |
| + // Docs + freqs: |
| + private final static class LowFreqDocsEnumNoPos extends DocsEnum { |
| + private int[] postings; |
| + private final Bits liveDocs; |
| + private int upto; |
| + |
| + public LowFreqDocsEnumNoPos(Bits liveDocs) { |
| + this.liveDocs = liveDocs; |
| + } |
| + |
| + public boolean canReuse(Bits liveDocs) { |
| + return liveDocs == this.liveDocs; |
| + } |
| + |
| + public DocsEnum reset(int[] postings) { |
| + this.postings = postings; |
| + upto = -2; |
| + return this; |
| + } |
| + |
| + // TODO: can do this w/o setting members? |
| + @Override |
| + public int nextDoc() { |
| + upto += 2; |
| + if (liveDocs == null) { |
| + if (upto < postings.length) { |
| + return postings[upto]; |
| + } |
| + } else { |
| + while (upto < postings.length) { |
| + if (liveDocs.get(postings[upto])) { |
| + return postings[upto]; |
| + } |
| + upto += 2; |
| + } |
| + } |
| + return NO_MORE_DOCS; |
| + } |
| + |
| + @Override |
| + public int docID() { |
| + if (upto < 0) { |
| + return -1; |
| + } else if (upto < postings.length) { |
| + return postings[upto]; |
| + } else { |
| + return NO_MORE_DOCS; |
| + } |
| + } |
| + |
| + @Override |
| + public int freq() { |
| + return postings[upto+1]; |
| + } |
| + |
| + @Override |
| + public int advance(int target) { |
| + // Linear scan, but this is low-freq term so it won't |
| + // be costly: |
| + while(nextDoc() < target) { |
| + } |
| + return docID(); |
| + } |
| + } |
| + |
| + // Docs + freqs + positions/offsets: |
| + private final static class LowFreqDocsEnum extends DocsEnum { |
| + private int[] postings; |
| + private final Bits liveDocs; |
| + private final int posMult; |
| + private int upto; |
| + private int freq; |
| + |
| + public LowFreqDocsEnum(Bits liveDocs, int posMult) { |
| + this.liveDocs = liveDocs; |
| + this.posMult = posMult; |
| + // if (DEBUG) { |
| + // System.out.println("LowFreqDE: posMult=" + posMult); |
| + // } |
| + } |
| + |
| + public boolean canReuse(Bits liveDocs, int posMult) { |
| + return liveDocs == this.liveDocs && posMult == this.posMult; |
| + } |
| + |
| + public DocsEnum reset(int[] postings) { |
| + this.postings = postings; |
| + upto = -2; |
| + freq = 0; |
| + return this; |
| + } |
| + |
| + // TODO: can do this w/o setting members? |
| + @Override |
| + public int nextDoc() { |
| + upto += 2 + freq*posMult; |
| + // if (DEBUG) { |
| + // System.out.println(" nextDoc freq=" + freq + " upto=" + upto + " vs " + postings.length); |
| + // } |
| + if (liveDocs == null) { |
| + if (upto < postings.length) { |
| + freq = postings[upto+1]; |
| + assert freq > 0; |
| + return postings[upto]; |
| + } |
| + } else { |
| + while (upto < postings.length) { |
| + freq = postings[upto+1]; |
| + assert freq > 0; |
| + if (liveDocs.get(postings[upto])) { |
| + return postings[upto]; |
| + } |
| + upto += 2 + freq*posMult; |
| + } |
| + } |
| + return NO_MORE_DOCS; |
| + } |
| + |
| + @Override |
| + public int docID() { |
| + // TODO: store docID member? |
| + if (upto < 0) { |
| + return -1; |
| + } else if (upto < postings.length) { |
| + return postings[upto]; |
| + } else { |
| + return NO_MORE_DOCS; |
| + } |
| + } |
| + |
| + @Override |
| + public int freq() { |
| + // TODO: can I do postings[upto+1]? |
| + return freq; |
| + } |
| + |
| + @Override |
| + public int advance(int target) { |
| + // Linear scan, but this is low-freq term so it won't |
| + // be costly: |
| + while(nextDoc() < target) { |
| + } |
| + return docID(); |
| + } |
| + } |
| + |
| + private final static class LowFreqDocsAndPositionsEnum extends DocsAndPositionsEnum { |
| + private int[] postings; |
| + private final Bits liveDocs; |
| + private final int posMult; |
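| + // ints per position: pos (, startOffset, endOffset) (, payloadLength) |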
| + private final boolean hasOffsets; |
| + private final boolean hasPayloads; |
| + private final BytesRef payload = new BytesRef(); |
| + private int upto; |
| + private int docID; |
| + private int freq; |
| + private int skipPositions; |
| + private int startOffset; |
| + private int endOffset; |
| + private int payloadOffset; |
| + private int payloadLength; |
| + |
| + public LowFreqDocsAndPositionsEnum(Bits liveDocs, boolean hasOffsets, boolean hasPayloads) { |
| + this.liveDocs = liveDocs; |
| + this.hasOffsets = hasOffsets; |
| + this.hasPayloads = hasPayloads; |
| + if (hasOffsets) { |
| + if (hasPayloads) { |
| + posMult = 4; |
| + } else { |
| + posMult = 3; |
| + } |
| + } else if (hasPayloads) { |
| + posMult = 2; |
| + } else { |
| + posMult = 1; |
| + } |
| + } |
| + |
| + public DocsAndPositionsEnum reset(int[] postings, byte[] payloadBytes) { |
| + this.postings = postings; |
| + upto = 0; |
| + skipPositions = 0; |
| + startOffset = -1; |
| + endOffset = -1; |
| + docID = -1; |
| + payloadLength = 0; |
| + payloadOffset = 0; // else a reused enum reads stale payload bytes |
| + payload.bytes = payloadBytes; |
| + return this; |
| + } |
| + |
| + @Override |
| + public int nextDoc() { |
| + if (hasPayloads) { |
| + for(int i=0;i<skipPositions;i++) { |
| + upto++; |
| + if (hasOffsets) { |
| + upto += 2; |
| + } |
| + payloadOffset += postings[upto++]; |
| + } |
| + } else { |
| + upto += posMult * skipPositions; |
| + } |
| + // clear stale skipPositions so a nextDoc() call after |
| + // exhaustion can't walk past the end of postings: |
| + skipPositions = 0; |
| + |
| + if (liveDocs == null) { |
| + if (upto < postings.length) { |
| + docID = postings[upto++]; |
| + freq = postings[upto++]; |
| + skipPositions = freq; |
| + return docID; |
| + } |
| + } else { |
| + while(upto < postings.length) { |
| + docID = postings[upto++]; |
| + freq = postings[upto++]; |
| + if (liveDocs.get(docID)) { |
| + skipPositions = freq; |
| + return docID; |
| + } |
| + if (hasPayloads) { |
| + // must also advance payloadOffset past the skipped doc: |
| + for(int i=0;i<freq;i++) { |
| + upto++; |
| + if (hasOffsets) { |
| + upto += 2; |
| + } |
| + payloadOffset += postings[upto++]; |
| + } |
| + } else { |
| + upto += posMult * freq; |
| + } |
| + } |
| + } |
| + |
| + return docID = NO_MORE_DOCS; |
| + } |
| + |
| + @Override |
| + public int docID() { |
| + return docID; |
| + } |
| + |
| + @Override |
| + public int freq() { |
| + return freq; |
| + } |
| + |
| + @Override |
| + public int nextPosition() { |
| + assert skipPositions > 0; |
| + skipPositions--; |
| + final int pos = postings[upto++]; |
| + if (hasOffsets) { |
| + startOffset = postings[upto++]; |
| + endOffset = postings[upto++]; |
| + } |
| + if (hasPayloads) { |
| + payloadLength = postings[upto++]; |
| + payload.offset = payloadOffset; |
| + payloadOffset += payloadLength; |
| + } |
| + return pos; |
| + } |
| + |
| + @Override |
| + public int startOffset() { |
| + return startOffset; |
| + } |
| + |
| + @Override |
| + public int endOffset() { |
| + return endOffset; |
| + } |
| + |
| + @Override |
| + public int advance(int target) { |
| + // Linear scan, but this is low-freq term so it won't |
| + // be costly: |
| + while (nextDoc() < target) { |
| + } |
| + return docID; |
| + } |
| + |
| + @Override |
| + public boolean hasPayload() { |
| + return payloadLength > 0; |
| + } |
| + |
| + @Override |
| + public BytesRef getPayload() { |
| + if (payloadLength > 0) { |
| + payload.length = payloadLength; |
| + payloadLength = 0; |
| + return payload; |
| + } else { |
| + return null; |
| + } |
| + } |
| + } |
| + |
| + // Docs + freqs: |
| + public final static class HighFreqDocsEnum extends DocsEnum { |
| + private int[] docIDs; |
| + private int[] freqs; |
| + private final Bits liveDocs; |
| + private int upto; |
| + private int docID = -1; |
| + |
| + public HighFreqDocsEnum(Bits liveDocs) { |
| + this.liveDocs = liveDocs; |
| + } |
| + |
| + public boolean canReuse(Bits liveDocs) { |
| + return liveDocs == this.liveDocs; |
| + } |
| + |
| + public int[] getDocIDs() { |
| + return docIDs; |
| + } |
| + |
| + public int[] getFreqs() { |
| + return freqs; |
| + } |
| + |
| + public DocsEnum reset(int[] docIDs, int[] freqs) { |
| + this.docIDs = docIDs; |
| + this.freqs = freqs; |
| + upto = -1; |
| + return this; |
| + } |
| + |
| + @Override |
| + public int nextDoc() { |
| + upto++; |
| + if (liveDocs == null) { |
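| + // Gamble that upto stays in bounds: catching the |
| + // one overflow at the end is cheaper than a bounds |
| + // check on every call: |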
| + try { |
| + return docID = docIDs[upto]; |
| + } catch (ArrayIndexOutOfBoundsException e) { |
| + } |
| + } else { |
| + while (upto < docIDs.length) { |
| + if (liveDocs.get(docIDs[upto])) { |
| + return docID = docIDs[upto]; |
| + } |
| + upto++; |
| + } |
| + } |
| + return docID = NO_MORE_DOCS; |
| + } |
| + |
| + @Override |
| + public int docID() { |
| + return docID; |
| + } |
| + |
| + @Override |
| + public int freq() { |
| + return freqs[upto]; |
| + } |
| + |
| + @Override |
| + public int advance(int target) { |
| + upto++; |
| + if (upto == docIDs.length) { |
| + return docID = NO_MORE_DOCS; |
| + } |
| + |
| + // First "grow" outwards, since most advances are to |
| + // nearby docs: |
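| +      // (e.g. it probes docIDs[upto+10], docIDs[upto+30], docIDs[upto+70],
| +      // ..., doubling the step on each probe, then binary-searches the
| +      // last bracketed window)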
| + int inc = 10; |
| + int nextUpto = upto+10; |
| + int low; |
| + int high; |
| +      while (true) {
| +        if (nextUpto >= docIDs.length) {
| +          low = nextUpto-inc;
| +          high = docIDs.length-1;
| +          break;
| +        }
| +        if (target <= docIDs[nextUpto]) {
| +          low = nextUpto-inc;
| +          high = nextUpto;
| +          break;
| +        }
| +        inc *= 2;
| +        nextUpto += inc;
| +      }
| +
| +      // Now do normal binary search
| +      while (true) {
| +
| +        if (low > high) {
| +          // Not exactly found
| +          upto = low;
| +          break;
| +        }
| +
| +        int mid = (low + high) >>> 1;
| +        int cmp = docIDs[mid] - target;
| +
| +        if (cmp < 0) {
| +          low = mid + 1;
| +        } else if (cmp > 0) {
| +          high = mid - 1;
| +        } else {
| +          // Found target
| +          upto = mid;
| +          break;
| +        }
| +      }
| +
| +      if (liveDocs != null) {
| +        while (upto < docIDs.length) {
| +          if (liveDocs.get(docIDs[upto])) {
| +            break;
| +          }
| +          upto++;
| +        }
| +      }
| +      if (upto == docIDs.length) {
| +        return docID = NO_MORE_DOCS;
| +      } else {
| +        return docID = docIDs[upto];
| +      }
| +    }
| + } |
| + |
| +  // TODO: specialize for the offsets and non-offsets cases
| +  public static final class HighFreqDocsAndPositionsEnum extends DocsAndPositionsEnum {
| + private int[] docIDs; |
| + private int[] freqs; |
| + private int[][] positions; |
| + private byte[][][] payloads; |
| + private final Bits liveDocs; |
| + private final boolean hasOffsets; |
| + private final int posJump; |
| + private int upto; |
| + private int docID = -1; |
| + private int posUpto; |
| + private boolean gotPayload; |
| + private int[] curPositions; |
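| +    // positions[i] holds the flattened positions of doc i: one int per
| +    // position, or three (pos, startOffset, endOffset) when offsets are
| +    // indexed, hence posJump; payloads, if any, live in the parallel
| +    // payloads[i][posOrd] arrays.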
| + |
| + public HighFreqDocsAndPositionsEnum(Bits liveDocs, boolean hasOffsets) { |
| + this.liveDocs = liveDocs; |
| + this.hasOffsets = hasOffsets; |
| + posJump = hasOffsets ? 3 : 1; |
| + } |
| + |
| + public int[] getDocIDs() { |
| + return docIDs; |
| + } |
| + |
| + public int[][] getPositions() { |
| + return positions; |
| + } |
| + |
| + public int getPosJump() { |
| + return posJump; |
| + } |
| + |
| + public Bits getLiveDocs() { |
| + return liveDocs; |
| + } |
| + |
| + public DocsAndPositionsEnum reset(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads) { |
| + this.docIDs = docIDs; |
| + this.freqs = freqs; |
| + this.positions = positions; |
| + this.payloads = payloads; |
| + upto = -1; |
| + return this; |
| + } |
| + |
| + @Override |
| + public int nextDoc() { |
| + upto++; |
| + if (liveDocs == null) { |
| + if (upto < docIDs.length) { |
| + posUpto = -posJump; |
| + curPositions = positions[upto]; |
| + return docID = docIDs[upto]; |
| + } |
| + } else { |
| + while (upto < docIDs.length) { |
| + if (liveDocs.get(docIDs[upto])) { |
| + posUpto = -posJump; |
| + curPositions = positions[upto]; |
| + return docID = docIDs[upto]; |
| + } |
| + upto++; |
| + } |
| + } |
| + |
| + return docID = NO_MORE_DOCS; |
| + } |
| + |
| + @Override |
| + public int freq() { |
| + return freqs[upto]; |
| + } |
| + |
| + @Override |
| + public int docID() { |
| + return docID; |
| + } |
| + |
| + @Override |
| + public int nextPosition() { |
| + posUpto += posJump; |
| + gotPayload = false; |
| + return curPositions[posUpto]; |
| + } |
| + |
| + @Override |
| + public int startOffset() { |
| + if (hasOffsets) { |
| + return curPositions[posUpto+1]; |
| + } else { |
| + return -1; |
| + } |
| + } |
| + |
| + @Override |
| + public int endOffset() { |
| + if (hasOffsets) { |
| + return curPositions[posUpto+2]; |
| + } else { |
| + return -1; |
| + } |
| + } |
| + |
| + @Override |
| + public int advance(int target) { |
| + upto++; |
| + if (upto == docIDs.length) { |
| + return docID = NO_MORE_DOCS; |
| + } |
| + |
| + // First "grow" outwards, since most advances are to |
| + // nearby docs: |
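| +      // (same galloping probe as in HighFreqDocsEnum.advance: the step
| +      // doubles on each probe until the target is bracketed, then a
| +      // binary search narrows within the window)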
| + int inc = 10; |
| + int nextUpto = upto+10; |
| + int low; |
| + int high; |
| +      while (true) {
| +        if (nextUpto >= docIDs.length) {
| +          low = nextUpto-inc;
| +          high = docIDs.length-1;
| +          break;
| +        }
| +        if (target <= docIDs[nextUpto]) {
| +          low = nextUpto-inc;
| +          high = nextUpto;
| +          break;
| +        }
| +        inc *= 2;
| +        nextUpto += inc;
| +      }
| +
| +      // Now do normal binary search
| +      while (true) {
| +
| +        if (low > high) {
| +          // Not exactly found
| +          upto = low;
| +          break;
| +        }
| +
| +        int mid = (low + high) >>> 1;
| +        int cmp = docIDs[mid] - target;
| +
| +        if (cmp < 0) {
| +          low = mid + 1;
| +        } else if (cmp > 0) {
| +          high = mid - 1;
| +        } else {
| +          // Found target
| +          upto = mid;
| +          break;
| +        }
| +      }
| +
| +      if (liveDocs != null) {
| +        while (upto < docIDs.length) {
| +          if (liveDocs.get(docIDs[upto])) {
| +            break;
| +          }
| +          upto++;
| +        }
| +      }
| +      if (upto == docIDs.length) {
| +        return docID = NO_MORE_DOCS;
| +      } else {
| +        posUpto = -posJump;
| +        curPositions = positions[upto];
| +        return docID = docIDs[upto];
| +      }
| +    }
| + |
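| +    // posUpto/(hasOffsets ? 3 : 1), i.e. posUpto/posJump, maps the
| +    // cursor into curPositions back to the position ordinal that
| +    // indexes the per-position payloads: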
| + @Override |
| + public boolean hasPayload() { |
| + return !gotPayload && payloads != null && payloads[upto][posUpto/(hasOffsets ? 3 : 1)] != null; |
| + } |
| + |
| + private final BytesRef payload = new BytesRef(); |
| + |
| + @Override |
| + public BytesRef getPayload() { |
| + final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)]; |
| + payload.bytes = payloadBytes; |
| + payload.length = payloadBytes.length; |
| + payload.offset = 0; |
| + gotPayload = true; |
| + return payload; |
| + } |
| + } |
| +} |
| Index: lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat |
| =================================================================== |
| --- lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (revision 1363740) |
| +++ lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (working copy) |
| @@ -17,3 +17,4 @@ |
| org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat |
| org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat |
| org.apache.lucene.codecs.memory.MemoryPostingsFormat |
| +org.apache.lucene.codecs.memory.DirectPostingsFormat |
| Index: lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java (working copy) |
| @@ -41,7 +41,7 @@ |
| // - test pulling docs in 2nd round trip... |
| // - filter too |
| |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class TestShardSearching extends ShardSearchingTestBase { |
| |
| private static class PreviousSearchState { |
| Index: lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java (working copy) |
| @@ -43,7 +43,7 @@ |
| import org.apache.lucene.util.NamedThreadFactory; |
| import org.apache.lucene.util._TestUtil; |
| |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class TestSearcherManager extends ThreadedIndexingAndSearchingTestCase { |
| |
| boolean warmCalled; |
| Index: lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java (working copy) |
| @@ -41,7 +41,7 @@ |
| import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; |
| import org.apache.lucene.util.ThreadInterruptedException; |
| |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class TestNRTManager extends ThreadedIndexingAndSearchingTestCase { |
| |
| private final ThreadLocal<Long> lastGens = new ThreadLocal<Long>(); |
| Index: lucene/core/src/test/org/apache/lucene/search/TestSearchWithThreads.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/search/TestSearchWithThreads.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/search/TestSearchWithThreads.java (working copy) |
| @@ -29,7 +29,7 @@ |
| import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; |
| import org.apache.lucene.util.LuceneTestCase; |
| |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class TestSearchWithThreads extends LuceneTestCase { |
| int NUM_DOCS; |
| final int NUM_SEARCH_THREADS = 5; |
| Index: lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java (working copy) |
| @@ -37,7 +37,7 @@ |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| import org.apache.lucene.util.automaton.RegExp; |
| |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class TestTermsEnum extends LuceneTestCase { |
| |
| public void test() throws Exception { |
| Index: lucene/core/src/test/org/apache/lucene/index/Test2BPostings.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/Test2BPostings.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/index/Test2BPostings.java (working copy) |
| @@ -34,7 +34,7 @@ |
| * Test indexes ~82M docs with 26 terms each, so you get > Integer.MAX_VALUE terms/docs pairs |
| * @lucene.experimental |
| */ |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class Test2BPostings extends LuceneTestCase { |
| |
| @Nightly |
| Index: lucene/core/src/test/org/apache/lucene/index/TestNRTThreads.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/TestNRTThreads.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/index/TestNRTThreads.java (working copy) |
| @@ -28,7 +28,7 @@ |
| // - mix in forceMerge, addIndexes |
| // - randomoly mix in non-congruent docs |
| |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class TestNRTThreads extends ThreadedIndexingAndSearchingTestCase { |
| |
| @Override |
| Index: lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java (working copy) |
| @@ -41,7 +41,7 @@ |
| // |
| // java -server -Xmx8g -d64 -cp .:lib/junit-4.10.jar:./build/classes/test:./build/classes/test-framework:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=MMapDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms |
| // |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class Test2BTerms extends LuceneTestCase { |
| |
| private final static int TOKEN_LEN = 10; |
| Index: lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java (working copy) |
| @@ -132,8 +132,9 @@ |
| |
| public void testLazySkipping() throws IOException { |
| final String fieldFormat = _TestUtil.getPostingsFormat(this.field); |
| - assumeFalse("This test cannot run with Memory codec", fieldFormat.equals("Memory")); |
| - assumeFalse("This test cannot run with SimpleText codec", fieldFormat.equals("SimpleText")); |
| + assumeFalse("This test cannot run with Memory postings format", fieldFormat.equals("Memory")); |
| + assumeFalse("This test cannot run with Direct postings format", fieldFormat.equals("Direct")); |
| + assumeFalse("This test cannot run with SimpleText postings format", fieldFormat.equals("SimpleText")); |
| |
| // test whether only the minimum amount of seeks() |
| // are performed |
| Index: lucene/core/src/test/org/apache/lucene/index/TestNorms.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/TestNorms.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/index/TestNorms.java (working copy) |
| @@ -40,7 +40,7 @@ |
| * Test that norms info is preserved during index life - including |
| * separate norms, addDocument, addIndexes, forceMerge. |
| */ |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| @Slow |
| public class TestNorms extends LuceneTestCase { |
| final String byteTestField = "normsTestByte"; |
| Index: lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java (working copy) |
| @@ -37,7 +37,7 @@ |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util._TestUtil; |
| |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| public class TestLongPostings extends LuceneTestCase { |
| |
| // Produces a realistic unicode random string that |
| Index: lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java (working copy) |
| @@ -976,7 +976,7 @@ |
| // Don't proceed if picked Codec is in the list of illegal ones. |
| final String format = _TestUtil.getPostingsFormat("f"); |
| assumeFalse("Format: " + format + " does not support ReaderTermsIndexDivisor!", |
| - (format.equals("SimpleText") || format.equals("Memory"))); |
| + (format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct"))); |
| |
| Directory dir = newDirectory(); |
| IndexWriter w = new IndexWriter(dir, conf); |
| Index: lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1363740) |
| +++ lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) |
| @@ -67,7 +67,7 @@ |
| import org.apache.lucene.util.fst.PairOutputs.Pair; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| -@SuppressCodecs({ "SimpleText", "Memory" }) |
| +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) |
| @Slow |
| public class TestFSTs extends LuceneTestCase { |
| |
| Index: lucene/MIGRATE.txt |
| =================================================================== |
| --- lucene/MIGRATE.txt (revision 1363740) |
| +++ lucene/MIGRATE.txt (working copy) |
| @@ -629,3 +629,22 @@
| instance exposing the inverted index of the one document. From |
| Fields you can enumerate all fields, terms, positions, offsets. |
| |
| +* LUCENE-4227: If you were previously using Instantiated index, you |
| + may want to use DirectPostingsFormat after upgrading: it stores all |
| +  postings in simple arrays (byte[] for terms, int[] for docs, freqs,
| + positions, offsets). Note that this only covers postings, whereas |
| + Instantiated covered all other parts of the index as well. |
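| +
| +  For example, here is a minimal sketch (assuming an existing
| +  Analyzer "analyzer" and Directory "dir", and using the usual
| +  Lucene 4.0 per-field codec hook) that indexes every field with
| +  DirectPostingsFormat:
| +
| +    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
| +    iwc.setCodec(new Lucene40Codec() {
| +      @Override
| +      public PostingsFormat getPostingsFormatForField(String field) {
| +        return new DirectPostingsFormat();
| +      }
| +    });
| +    IndexWriter writer = new IndexWriter(dir, iwc);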
| Index: lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java |
| =================================================================== |
| --- lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java (revision 1363740) |
| +++ lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java (working copy) |
| @@ -32,6 +32,7 @@ |
| import org.apache.lucene.codecs.lucene40.Lucene40Codec; |
| import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; |
| import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds; |
| +import org.apache.lucene.codecs.memory.DirectPostingsFormat; |
| import org.apache.lucene.codecs.memory.MemoryPostingsFormat; |
| import org.apache.lucene.codecs.mockintblock.MockFixedIntBlockPostingsFormat; |
| import org.apache.lucene.codecs.mockintblock.MockVariableIntBlockPostingsFormat; |
| @@ -87,9 +88,11 @@ |
| // block via CL: |
| int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100); |
| int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100); |
| + int lowFreqCutoff = _TestUtil.nextInt(random, 2, 100); |
| |
| add(avoidCodecs, |
| new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock), |
| + new DirectPostingsFormat(minItemsPerBlock, lowFreqCutoff), |
| new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), |
| // add pulsing again with (usually) different parameters |
| new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), |