| package org.apache.lucene.util; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Comparator; |
| |
| import org.apache.lucene.search.DocIdSet; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.store.ByteArrayDataInput; |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| /** |
| * {@link DocIdSet} implementation based on word-aligned hybrid encoding on |
| * words of 8 bits. |
| * <p>This implementation doesn't support random-access but has a fast |
| * {@link DocIdSetIterator} which can advance in logarithmic time thanks to |
| * an index.</p> |
| * <p>The compression scheme is simplistic and should work well with sparse and |
| * very dense doc id sets while being only slightly larger than a |
 * {@link FixedBitSet} for incompressible sets (overhead&lt;2% in the worst
| * case) in spite of the index.</p> |
| * <p><b>Format</b>: The format is byte-aligned. An 8-bits word is either clean, |
| * meaning composed only of zeros or ones, or dirty, meaning that it contains |
| * between 1 and 7 bits set. The idea is to encode sequences of clean words |
| * using run-length encoding and to leave sequences of dirty words as-is.</p> |
| * <table> |
| * <tr><th>Token</th><th>Clean length+</th><th>Dirty length+</th><th>Dirty words</th></tr> |
| * <tr><td>1 byte</td><td>0-n bytes</td><td>0-n bytes</td><td>0-n bytes</td></tr> |
| * </table> |
| * <ul> |
| * <li><b>Token</b> encodes whether clean means full of zeros or ones in the |
| * first bit, the number of clean words minus 2 on the next 3 bits and the |
| * number of dirty words on the last 4 bits. The higher-order bit is a |
| * continuation bit, meaning that the number is incomplete and needs additional |
| * bytes to be read.</li> |
| * <li><b>Clean length+</b>: If clean length has its higher-order bit set, |
| * you need to read a {@link DataInput#readVInt() vint}, shift it by 3 bits on |
| * the left side and add it to the 3 bits which have been read in the token.</li> |
| * <li><b>Dirty length+</b> works the same way as <b>Clean length+</b> but |
| * on 4 bits and for the length of dirty words.</li> |
| * <li><b>Dirty words</b> are the dirty words, there are <b>Dirty length</b> |
| * of them.</li> |
| * </ul> |
| * <p>This format cannot encode sequences of less than 2 clean words and 0 dirty |
| * word. The reason is that if you find a single clean word, you should rather |
| * encode it as a dirty word. This takes the same space as starting a new |
| * sequence (since you need one byte for the token) but will be lighter to |
| * decode. There is however an exception for the first sequence. Since the first |
| * sequence may start directly with a dirty word, the clean length is encoded |
| * directly, without subtracting 2.</p> |
| * <p>There is an additional restriction on the format: the sequence of dirty |
| * words is not allowed to contain two consecutive clean words. This restriction |
| * exists to make sure no space is wasted and to make sure iterators can read |
| * the next doc ID by reading at most 2 dirty words.</p> |
| * @lucene.experimental |
| */ |
| public final class WAH8DocIdSet extends DocIdSet { |
| |
| // Minimum index interval, intervals below this value can't guarantee anymore |
| // that this set implementation won't be significantly larger than a FixedBitSet |
| // The reason is that a single sequence saves at least one byte and an index |
| // entry requires at most 8 bytes (2 ints) so there shouldn't be more than one |
| // index entry every 8 sequences |
| private static final int MIN_INDEX_INTERVAL = 8; |
| |
| /** Default index interval. */ |
| public static final int DEFAULT_INDEX_INTERVAL = 24; |
| |
| private static final MonotonicAppendingLongBuffer SINGLE_ZERO_BUFFER = new MonotonicAppendingLongBuffer(1, 64, PackedInts.COMPACT); |
| private static WAH8DocIdSet EMPTY = new WAH8DocIdSet(new byte[0], 0, 1, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER); |
| |
| static { |
| SINGLE_ZERO_BUFFER.add(0L); |
| SINGLE_ZERO_BUFFER.freeze(); |
| } |
| |
| private static final Comparator<Iterator> SERIALIZED_LENGTH_COMPARATOR = new Comparator<Iterator>() { |
| @Override |
| public int compare(Iterator wi1, Iterator wi2) { |
| return wi1.in.length() - wi2.in.length(); |
| } |
| }; |
| |
  /** Same as {@link #intersect(Collection, int)} with the default index interval.
   *  @param docIdSets the sets to intersect, must contain at least one set
   *  @return a new {@link WAH8DocIdSet} containing the intersection */
  public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets) {
    return intersect(docIdSets, DEFAULT_INDEX_INTERVAL);
  }
| |
| /** |
| * Compute the intersection of the provided sets. This method is much faster than |
| * computing the intersection manually since it operates directly at the byte level. |
| */ |
| public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets, int indexInterval) { |
| switch (docIdSets.size()) { |
| case 0: |
| throw new IllegalArgumentException("There must be at least one set to intersect"); |
| case 1: |
| return docIdSets.iterator().next(); |
| } |
| // The logic below is similar to ConjunctionScorer |
| final int numSets = docIdSets.size(); |
| final Iterator[] iterators = new Iterator[numSets]; |
| int i = 0; |
| for (WAH8DocIdSet set : docIdSets) { |
| final Iterator it = set.iterator(); |
| iterators[i++] = it; |
| } |
| Arrays.sort(iterators, SERIALIZED_LENGTH_COMPARATOR); |
| final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval); |
| int wordNum = 0; |
| main: |
| while (true) { |
| // Advance the least costly iterator first |
| iterators[0].advanceWord(wordNum); |
| wordNum = iterators[0].wordNum; |
| if (wordNum == DocIdSetIterator.NO_MORE_DOCS) { |
| break; |
| } |
| byte word = iterators[0].word; |
| for (i = 1; i < numSets; ++i) { |
| if (iterators[i].wordNum < wordNum) { |
| iterators[i].advanceWord(wordNum); |
| } |
| if (iterators[i].wordNum > wordNum) { |
| wordNum = iterators[i].wordNum; |
| continue main; |
| } |
| assert iterators[i].wordNum == wordNum; |
| word &= iterators[i].word; |
| if (word == 0) { |
| // There are common words, but they don't share any bit |
| ++wordNum; |
| continue main; |
| } |
| } |
| // Found a common word |
| assert word != 0; |
| builder.addWord(wordNum, word); |
| ++wordNum; |
| } |
| return builder.build(); |
| } |
| |
  /** Same as {@link #union(Collection, int)} with the default index interval.
   *  @param docIdSets the sets to union, may be empty
   *  @return a new {@link WAH8DocIdSet} containing the union */
  public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets) {
    return union(docIdSets, DEFAULT_INDEX_INTERVAL);
  }
| |
| /** |
| * Compute the union of the provided sets. This method is much faster than |
| * computing the union manually since it operates directly at the byte level. |
| */ |
| public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets, int indexInterval) { |
| switch (docIdSets.size()) { |
| case 0: |
| return EMPTY; |
| case 1: |
| return docIdSets.iterator().next(); |
| } |
| // The logic below is very similar to DisjunctionScorer |
| final int numSets = docIdSets.size(); |
| final PriorityQueue<Iterator> iterators = new PriorityQueue<WAH8DocIdSet.Iterator>(numSets) { |
| @Override |
| protected boolean lessThan(Iterator a, Iterator b) { |
| return a.wordNum < b.wordNum; |
| } |
| }; |
| for (WAH8DocIdSet set : docIdSets) { |
| Iterator iterator = set.iterator(); |
| iterator.nextWord(); |
| iterators.add(iterator); |
| } |
| |
| Iterator top = iterators.top(); |
| if (top.wordNum == Integer.MAX_VALUE) { |
| return EMPTY; |
| } |
| int wordNum = top.wordNum; |
| byte word = top.word; |
| final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval); |
| while (true) { |
| top.nextWord(); |
| iterators.updateTop(); |
| top = iterators.top(); |
| if (top.wordNum == wordNum) { |
| word |= top.word; |
| } else { |
| builder.addWord(wordNum, word); |
| if (top.wordNum == Integer.MAX_VALUE) { |
| break; |
| } |
| wordNum = top.wordNum; |
| word = top.word; |
| } |
| } |
| return builder.build(); |
| } |
| |
| static int wordNum(int docID) { |
| assert docID >= 0; |
| return docID >>> 3; |
| } |
| |
  /** Word-based builder: accepts whole non-null 8-bit words in increasing word
   *  order (via {@link #addWord}) and encodes them with the clean/dirty
   *  run-length scheme described on {@link WAH8DocIdSet}. */
  static class WordBuilder {

    final GrowableByteArrayDataOutput out;        // fully encoded sequences written so far
    final GrowableByteArrayDataOutput dirtyWords; // dirty words of the sequence being built
    int clean;         // number of clean words preceding the current dirty run
    int lastWordNum;   // word number of the last word added, -1 before any add
    int numSequences;  // number of sequences flushed to 'out'
    int indexInterval; // one index entry every indexInterval sequences
    int cardinality;   // total number of bits set so far
    boolean reverse;   // true when the current clean run is all-ones (0xFF) words
| |
| WordBuilder() { |
| out = new GrowableByteArrayDataOutput(1024); |
| dirtyWords = new GrowableByteArrayDataOutput(128); |
| clean = 0; |
| lastWordNum = -1; |
| numSequences = 0; |
| indexInterval = DEFAULT_INDEX_INTERVAL; |
| cardinality = 0; |
| } |
| |
    /** Set the index interval. Smaller index intervals improve performance of
     *  {@link DocIdSetIterator#advance(int)} but make the {@link DocIdSet}
     *  larger. An index interval <code>i</code> makes the index add an overhead
     *  which is at most <code>4/i</code>, but likely much less. The default index
     *  interval is {@link WAH8DocIdSet#DEFAULT_INDEX_INTERVAL} (<code>24</code>),
     *  meaning the index has an overhead of at most about 17%. To disable
     *  indexing, you can pass {@link Integer#MAX_VALUE} as an index interval.
     *  @param indexInterval the new interval, must be at least <code>8</code>
     *  @return this builder
     *  @throws IllegalArgumentException if <code>indexInterval</code> is below
     *          the minimum supported interval */
    public WordBuilder setIndexInterval(int indexInterval) {
      if (indexInterval < MIN_INDEX_INTERVAL) {
        throw new IllegalArgumentException("indexInterval must be >= " + MIN_INDEX_INTERVAL);
      }
      this.indexInterval = indexInterval;
      return this;
    }
| |
    /** Write the one-byte sequence token, followed by optional vint extensions.
     *  Token layout: bit 7 = reverse flag (clean words are 0xFF instead of 0x00),
     *  bits 5-4 = low bits of (cleanLength - 2) with bit 6 as its continuation
     *  flag, bits 2-0 = low bits of dirtyLength with bit 3 as its continuation
     *  flag. When a continuation flag is set, the remaining high bits follow as
     *  a vint. */
    void writeHeader(boolean reverse, int cleanLength, int dirtyLength) throws IOException {
      final int cleanLengthMinus2 = cleanLength - 2;
      assert cleanLengthMinus2 >= 0;
      assert dirtyLength >= 0;
      int token = ((cleanLengthMinus2 & 0x03) << 4) | (dirtyLength & 0x07);
      if (reverse) {
        token |= 1 << 7;
      }
      if (cleanLengthMinus2 > 0x03) {
        token |= 1 << 6; // continuation flag for the clean length
      }
      if (dirtyLength > 0x07) {
        token |= 1 << 3; // continuation flag for the dirty length
      }
      out.writeByte((byte) token);
      if (cleanLengthMinus2 > 0x03) {
        out.writeVInt(cleanLengthMinus2 >>> 2); // remaining clean-length bits
      }
      if (dirtyLength > 0x07) {
        out.writeVInt(dirtyLength >>> 3); // remaining dirty-length bits
      }
    }
| |
| private boolean sequenceIsConsistent() { |
| for (int i = 1; i < dirtyWords.length; ++i) { |
| assert dirtyWords.bytes[i-1] != 0 || dirtyWords.bytes[i] != 0; |
| assert dirtyWords.bytes[i-1] != (byte) 0xFF || dirtyWords.bytes[i] != (byte) 0xFF; |
| } |
| return true; |
| } |
| |
    /** Flush the current sequence (header followed by the buffered dirty
     *  words) to 'out', then reset the dirty-word buffer for the next
     *  sequence. */
    void writeSequence() {
      assert sequenceIsConsistent();
      try {
        writeHeader(reverse, clean, dirtyWords.length);
      } catch (IOException cannotHappen) {
        // the output writes to an in-memory buffer, so no IOException can occur
        throw new AssertionError(cannotHappen);
      }
      out.writeBytes(dirtyWords.bytes, 0, dirtyWords.length);
      dirtyWords.length = 0;
      ++numSequences;
    }
| |
    /** Add a single non-null word. Words must be added in increasing word-number
     *  order; gaps between added words become clean words. This is the heart of
     *  the encoder: it grows the current sequence (clean run + dirty buffer)
     *  and flushes it through {@link #writeSequence()} whenever a new run has
     *  to start. */
    void addWord(int wordNum, byte word) {
      assert wordNum > lastWordNum;
      assert word != 0;

      if (!reverse) {
        if (lastWordNum == -1) {
          clean = 2 + wordNum; // special case for the 1st sequence
          dirtyWords.writeByte(word);
        } else {
          switch (wordNum - lastWordNum) {
            case 1:
              if (word == (byte) 0xFF && dirtyWords.bytes[dirtyWords.length-1] == (byte) 0xFF) {
                // two consecutive 0xFF words: turn them into an all-ones
                // (reverse) clean run of a new sequence
                --dirtyWords.length;
                writeSequence();
                reverse = true;
                clean = 2;
              } else {
                dirtyWords.writeByte(word);
              }
              break;
            case 2:
              // a single clean word between two dirty words is cheaper stored
              // as a dirty zero byte than as a new sequence header
              dirtyWords.writeByte((byte) 0);
              dirtyWords.writeByte(word);
              break;
            default:
              // gap of 2+ words: flush and start a new sequence whose clean
              // run covers the skipped zero words
              writeSequence();
              clean = wordNum - lastWordNum - 1;
              dirtyWords.writeByte(word);
          }
        }
      } else {
        assert lastWordNum >= 0;
        switch (wordNum - lastWordNum) {
          case 1:
            if (word == (byte) 0xFF) {
              if (dirtyWords.length == 0) {
                // the all-ones clean run is still growing
                ++clean;
              } else if (dirtyWords.bytes[dirtyWords.length - 1] == (byte) 0xFF) {
                // two consecutive 0xFF dirty words: start a new all-ones run
                --dirtyWords.length;
                writeSequence();
                clean = 2;
              } else {
                dirtyWords.writeByte(word);
              }
            } else {
              dirtyWords.writeByte(word);
            }
            break;
          case 2:
            // single skipped word encoded as a dirty zero byte, as above
            dirtyWords.writeByte((byte) 0);
            dirtyWords.writeByte(word);
            break;
          default:
            // gap of 2+ words: flush the all-ones sequence (header still uses
            // reverse == true) and fall back to a zero-based clean run
            writeSequence();
            reverse = false;
            clean = wordNum - lastWordNum - 1;
            dirtyWords.writeByte(word);
        }
      }
      lastWordNum = wordNum;
      cardinality += BitUtil.bitCount(word);
    }
| |
    /** Build a new {@link WAH8DocIdSet}. Flushes the in-progress sequence and
     *  then constructs the skip index by replaying the encoded stream,
     *  recording the byte position and first word number of every
     *  indexInterval-th sequence. */
    public WAH8DocIdSet build() {
      if (cardinality == 0) {
        assert lastWordNum == -1;
        return EMPTY;
      }
      writeSequence(); // flush the last, still-buffered sequence
      final byte[] data = Arrays.copyOf(out.bytes, out.length);

      // Now build the index
      final int valueCount = (numSequences - 1) / indexInterval + 1;
      final MonotonicAppendingLongBuffer indexPositions, indexWordNums;
      if (valueCount <= 1) {
        // a one-entry index is always (position 0, wordNum 0): share the
        // static buffer instead of allocating
        indexPositions = indexWordNums = SINGLE_ZERO_BUFFER;
      } else {
        final int pageSize = 128;
        final int initialPageCount = (valueCount + pageSize - 1) / pageSize;
        final MonotonicAppendingLongBuffer positions = new MonotonicAppendingLongBuffer(initialPageCount, pageSize, PackedInts.COMPACT);
        final MonotonicAppendingLongBuffer wordNums = new MonotonicAppendingLongBuffer(initialPageCount, pageSize, PackedInts.COMPACT);

        positions.add(0L);
        wordNums.add(0L);
        // replay the stream with an un-indexed iterator to find the entry points
        final Iterator it = new Iterator(data, cardinality, Integer.MAX_VALUE, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
        assert it.in.getPosition() == 0;
        assert it.wordNum == -1;
        for (int i = 1; i < valueCount; ++i) {
          // skip indexInterval sequences
          for (int j = 0; j < indexInterval; ++j) {
            final boolean readSequence = it.readSequence();
            assert readSequence;
            it.skipDirtyBytes();
          }
          final int position = it.in.getPosition();
          final int wordNum = it.wordNum;
          positions.add(position);
          wordNums.add(wordNum + 1);
        }
        positions.freeze();
        wordNums.freeze();
        indexPositions = positions;
        indexWordNums = wordNums;
      }

      return new WAH8DocIdSet(data, cardinality, indexInterval, indexPositions, indexWordNums);
    }
| |
| } |
| |
| /** A builder for {@link WAH8DocIdSet}s. */ |
| public static final class Builder extends WordBuilder { |
| |
| private int lastDocID; |
| private int wordNum, word; |
| |
| /** Sole constructor */ |
| public Builder() { |
| super(); |
| lastDocID = -1; |
| wordNum = -1; |
| word = 0; |
| } |
| |
| /** Add a document to this builder. Documents must be added in order. */ |
| public Builder add(int docID) { |
| if (docID <= lastDocID) { |
| throw new IllegalArgumentException("Doc ids must be added in-order, got " + docID + " which is <= lastDocID=" + lastDocID); |
| } |
| final int wordNum = wordNum(docID); |
| if (this.wordNum == -1) { |
| this.wordNum = wordNum; |
| word = 1 << (docID & 0x07); |
| } else if (wordNum == this.wordNum) { |
| word |= 1 << (docID & 0x07); |
| } else { |
| addWord(this.wordNum, (byte) word); |
| this.wordNum = wordNum; |
| word = 1 << (docID & 0x07); |
| } |
| lastDocID = docID; |
| return this; |
| } |
| |
| /** Add the content of the provided {@link DocIdSetIterator}. */ |
| public Builder add(DocIdSetIterator disi) throws IOException { |
| for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) { |
| add(doc); |
| } |
| return this; |
| } |
| |
| @Override |
| public Builder setIndexInterval(int indexInterval) { |
| return (Builder) super.setIndexInterval(indexInterval); |
| } |
| |
| @Override |
| public WAH8DocIdSet build() { |
| if (this.wordNum != -1) { |
| addWord(wordNum, (byte) word); |
| } |
| return super.build(); |
| } |
| |
| } |
| |
  // where the doc IDs are stored
  private final byte[] data;
  private final int cardinality;   // number of documents in the set
  private final int indexInterval; // one index entry every indexInterval sequences
  // index for advance(int)
  private final MonotonicAppendingLongBuffer positions, wordNums; // wordNums[i] starts at the sequence at positions[i]
| |
  /** Internal constructor: wraps already-encoded data together with its
   *  cardinality and skip index (see the field comments above). */
  WAH8DocIdSet(byte[] data, int cardinality, int indexInterval, MonotonicAppendingLongBuffer positions, MonotonicAppendingLongBuffer wordNums) {
    this.data = data;
    this.cardinality = cardinality;
    this.indexInterval = indexInterval;
    this.positions = positions;
    this.wordNums = wordNums;
  }
| |
  @Override
  public boolean isCacheable() {
    // the set is immutable and entirely held in memory, so caching it is safe
    return true;
  }
| |
  @Override
  public Iterator iterator() {
    // each iterator gets its own input but shares the encoded data and index
    return new Iterator(data, cardinality, indexInterval, positions, wordNums);
  }
| |
  /** Decode the clean length from a sequence token (plus its optional vint
   *  extension). The +2 offset described in the class javadoc is NOT applied
   *  to the very first sequence of the stream — detected by
   *  startPosition == 1, i.e. only the token byte has been consumed — because
   *  the first sequence encodes its clean length directly. */
  static int readCleanLength(ByteArrayDataInput in, int token) {
    int len = (token >>> 4) & 0x07;
    final int startPosition = in.getPosition();
    if ((len & 0x04) != 0) {
      // continuation flag set: the high bits follow as a vint
      len = (len & 0x03) | (in.readVInt() << 2);
    }
    if (startPosition != 1) {
      len += 2;
    }
    return len;
  }
| |
| static int readDirtyLength(ByteArrayDataInput in, int token) { |
| int len = token & 0x0F; |
| if ((len & 0x08) != 0) { |
| len = (len & 0x07) | (in.readVInt() << 3); |
| } |
| return len; |
| } |
| |
  /** Iterator over the doc IDs of a {@link WAH8DocIdSet}; decodes the
   *  sequence stream lazily and uses the positions/wordNums index to make
   *  {@link #advance(int)} logarithmic. */
  static class Iterator extends DocIdSetIterator {

    /* Using the index can be costly for close targets. */
    // NOTE(review): the 'cardinality' parameter is currently unused by the
    // computation; kept for signature stability.
    static int indexThreshold(int cardinality, int indexInterval) {
      // Short sequences encode for 3 words (2 clean words and 1 dirty byte),
      // don't advance if we are going to read less than 3 x indexInterval
      // sequences
      long indexThreshold = 3L * 3 * indexInterval;
      return (int) Math.min(Integer.MAX_VALUE, indexThreshold);
    }

    final ByteArrayDataInput in; // the encoded sequence stream
    final int cardinality;       // number of doc IDs in the set
    final int indexInterval;     // one index entry every indexInterval sequences
    final MonotonicAppendingLongBuffer positions, wordNums; // the skip index
    final int indexThreshold;    // minimum word delta before the index is used
    int allOnesLength; // remaining 0xFF clean words in the current sequence
    int dirtyLength;   // remaining dirty words in the current sequence

    int wordNum; // byte offset
    byte word; // current word
    int bitList; // list of bits set in the current word
    int sequenceNum; // in which sequence are we?

    int docID; // last doc ID returned, -1 before iteration starts
| |
    /** Sole constructor. positions/wordNums form the skip index described on
     *  the enclosing class; pass single-zero buffers to disable indexing. */
    Iterator(byte[] data, int cardinality, int indexInterval, MonotonicAppendingLongBuffer positions, MonotonicAppendingLongBuffer wordNums) {
      this.in = new ByteArrayDataInput(data);
      this.cardinality = cardinality;
      this.indexInterval = indexInterval;
      this.positions = positions;
      this.wordNums = wordNums;
      wordNum = -1;
      word = 0;
      bitList = 0;
      sequenceNum = -1;
      docID = -1;
      indexThreshold = indexThreshold(cardinality, indexInterval);
    }
| |
    /** Decode the header of the next sequence. Returns false — leaving wordNum
     *  at Integer.MAX_VALUE — when the stream is exhausted. On success,
     *  allOnesLength/dirtyLength describe the words of the new sequence and
     *  wordNum has been advanced past any run of clean zero words. */
    boolean readSequence() {
      if (in.eof()) {
        wordNum = Integer.MAX_VALUE;
        return false;
      }
      final int token = in.readByte() & 0xFF;
      if ((token & (1 << 7)) == 0) {
        // clean words are zeros: nothing to emit, just skip over them
        final int cleanLength = readCleanLength(in, token);
        wordNum += cleanLength;
      } else {
        // clean words are all ones: they must be iterated one by one
        allOnesLength = readCleanLength(in, token);
      }
      dirtyLength = readDirtyLength(in, token);
      assert in.length() - in.getPosition() >= dirtyLength : in.getPosition() + " " + in.length() + " " + dirtyLength;
      ++sequenceNum;
      return true;
    }
| |
    /** Skip the next <code>count</code> words of the current sequence,
     *  consuming the all-ones run first, then dirty bytes from the stream. */
    void skipDirtyBytes(int count) {
      assert count >= 0;
      assert count <= allOnesLength + dirtyLength;
      wordNum += count;
      if (count <= allOnesLength) {
        allOnesLength -= count;
      } else {
        // exhaust the all-ones run, then skip the remainder as dirty bytes
        count -= allOnesLength;
        allOnesLength = 0;
        in.skipBytes(count);
        dirtyLength -= count;
      }
    }
| |
| void skipDirtyBytes() { |
| wordNum += allOnesLength + dirtyLength; |
| in.skipBytes(dirtyLength); |
| allOnesLength = 0; |
| dirtyLength = 0; |
| } |
| |
    /** Advance to the next non-null word, stored in 'word'/'wordNum'. Reads a
     *  new sequence when the current one is exhausted; when the whole stream
     *  is exhausted, readSequence() leaves wordNum at Integer.MAX_VALUE. */
    void nextWord() {
      if (allOnesLength > 0) {
        // still inside an all-ones clean run
        word = (byte) 0xFF;
        ++wordNum;
        --allOnesLength;
        return;
      }
      if (dirtyLength > 0) {
        word = in.readByte();
        ++wordNum;
        --dirtyLength;
        if (word != 0) {
          return;
        }
        // a zero dirty word: the format guarantees the next dirty word (if
        // any) is non-null, so at most one extra read is needed
        if (dirtyLength > 0) {
          word = in.readByte();
          ++wordNum;
          --dirtyLength;
          assert word != 0; // never more than one consecutive 0
          return;
        }
      }
      if (readSequence()) {
        nextWord(); // recurse into the freshly decoded sequence
      }
    }
| |
    /** Return the index entry i such that wordNums[i] &lt;= targetWordNum and
     *  either i is the last entry or wordNums[i + 1] &gt; targetWordNum. Uses
     *  a galloping (window-doubling) phase starting from the current sequence,
     *  followed by a plain binary search inside the found window. */
    int forwardBinarySearch(int targetWordNum) {
      // advance forward and double the window at each step
      final int indexSize = (int) wordNums.size();
      int lo = sequenceNum / indexInterval, hi = lo + 1;
      assert sequenceNum == -1 || wordNums.get(lo) <= wordNum;
      assert lo + 1 == wordNums.size() || wordNums.get(lo + 1) > wordNum;
      while (true) {
        if (hi >= indexSize) {
          hi = indexSize - 1;
          break;
        } else if (wordNums.get(hi) >= targetWordNum) {
          break;
        }
        final int newLo = hi;
        hi += (hi - lo) << 1; // double the window size
        lo = newLo;
      }

      // we found a window containing our target, let's binary search now
      while (lo <= hi) {
        final int mid = (lo + hi) >>> 1; // unsigned shift: overflow-safe midpoint
        final int midWordNum = (int) wordNums.get(mid);
        if (midWordNum <= targetWordNum) {
          lo = mid + 1;
        } else {
          hi = mid - 1;
        }
      }
      assert wordNums.get(hi) <= targetWordNum;
      assert hi+1 == wordNums.size() || wordNums.get(hi + 1) > targetWordNum;
      return hi;
    }
| |
    /** Advance to the first non-null word whose word number is &gt;=
     *  targetWordNum, consulting the skip index for jumps larger than
     *  indexThreshold. */
    void advanceWord(int targetWordNum) {
      assert targetWordNum > wordNum;
      int delta = targetWordNum - wordNum;
      if (delta <= allOnesLength + dirtyLength + 1) {
        // the target lies within the current sequence
        skipDirtyBytes(delta - 1);
      } else {
        // the target lies beyond the current sequence
        skipDirtyBytes();
        assert dirtyLength == 0;
        if (delta > indexThreshold) {
          // use the index
          final int i = forwardBinarySearch(targetWordNum);
          final int position = (int) positions.get(i);
          if (position > in.getPosition()) { // if the binary search returned a backward offset, don't move
            wordNum = (int) wordNums.get(i) - 1;
            in.setPosition(position);
            sequenceNum = i * indexInterval - 1;
          }
        }

        // linear scan over the remaining sequences
        while (true) {
          if (!readSequence()) {
            return; // stream exhausted, wordNum is Integer.MAX_VALUE
          }
          delta = targetWordNum - wordNum;
          if (delta <= allOnesLength + dirtyLength + 1) {
            if (delta > 1) {
              skipDirtyBytes(delta - 1);
            }
            break;
          }
          skipDirtyBytes();
        }
      }

      nextWord();
    }
| |
| @Override |
| public int docID() { |
| return docID; |
| } |
| |
    /** Move to the next doc ID. bitList caches the positions (+1) of the set
     *  bits of the current word, packed 4 bits per entry, so consecutive doc
     *  IDs inside one word are served without touching the stream. */
    @Override
    public int nextDoc() throws IOException {
      if (bitList != 0) { // there are remaining bits in the current word
        docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
        bitList >>>= 4;
        return docID;
      }
      nextWord();
      if (wordNum == Integer.MAX_VALUE) {
        // stream exhausted
        return docID = NO_MORE_DOCS;
      }
      bitList = BitUtil.bitList(word);
      assert bitList != 0;
      docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
      bitList >>>= 4;
      return docID;
    }
| |
    /** Skip to the first doc ID &gt;= target: jump whole words via
     *  advanceWord, then scan the remaining bits with nextDoc() through
     *  slowAdvance. */
    @Override
    public int advance(int target) throws IOException {
      assert target > docID;
      final int targetWordNum = wordNum(target);
      if (targetWordNum > wordNum) {
        advanceWord(targetWordNum);
        bitList = BitUtil.bitList(word); // re-expand the bits of the new word
      }
      return slowAdvance(target);
    }
| |
    @Override
    public long cost() {
      // iterating the whole set visits exactly 'cardinality' documents
      return cardinality;
    }
| |
| } |
| |
  /** Return the number of documents in this {@link DocIdSet} in constant time.
   *  @return the exact number of doc IDs in the set */
  public int cardinality() {
    return cardinality;
  }
| |
| /** Return the memory usage of this class in bytes. */ |
| public long ramBytesUsed() { |
| return RamUsageEstimator.alignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2 * RamUsageEstimator.NUM_BYTES_INT) |
| + RamUsageEstimator.sizeOf(data) |
| + positions.ramBytesUsed() |
| + wordNums.ramBytesUsed(); |
| } |
| |
| } |