| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.lucene80; |
| |
| import java.io.DataInput; |
| import java.io.IOException; |
| |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.store.RandomAccessInput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BitSetIterator; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.RoaringDocIdSet; |
| |
| /** |
| * Disk-based implementation of a {@link DocIdSetIterator} which can return |
| * the index of the current document, i.e. the ordinal of the current document |
| * among the list of documents that this iterator can return. This is useful |
| * to implement sparse doc values by only having to encode values for documents |
| * that actually have a value. |
| * <p>Implementation-wise, this {@link DocIdSetIterator} is inspired of |
| * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536} |
| * documents independently and picks between 3 encodings depending on the |
| * density of the range:<ul> |
| * <li>{@code ALL} if the range contains 65536 documents exactly, |
| * <li>{@code DENSE} if the range contains 4096 documents or more; in that |
| * case documents are stored in a bit set, |
| * <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are |
| * stored in a {@link DataInput#readShort() short}. |
| * </ul> |
| * <p>Only ranges that contain at least one value are encoded. |
| * <p>This implementation uses 6 bytes per document in the worst-case, which happens |
| * in the case that all ranges contain exactly one document. |
| * |
| * |
| * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup |
| * tables are used: A lookup table for block offset and index, and a rank structure |
| * for DENSE block index lookups. |
| * |
| * The lookup table is an array of {@code int}-pairs, with a pair for each block. It allows for |
| * direct jumping to the block, as opposed to iteration from the current position and forward |
| * one block at a time. |
| * |
| * Each int-pair entry consists of 2 logical parts: |
| * |
| * The first 32 bit int holds the index (number of set bits in the blocks) up to just before the |
| * wanted block. The maximum number of set bits is the maximum number of documents, which is < 2^31. |
| * |
| * The next int holds the offset in bytes into the underlying slice. As there is a maximum of 2^16 |
| * blocks, it follows that the maximum size of any block must not exceed 2^15 bytes to avoid |
| * overflow (2^16 bytes if the int is treated as unsigned). This is currently the case, with the |
| * largest block being DENSE and using 2^13 + 36 bytes. |
| * |
| * The cache overhead is numDocs/1024 bytes. |
| * |
| * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits). |
| * In the case of non-existing blocks, the entry in the lookup table has index equal to the |
| * previous entry and offset equal to the next non-empty block. |
| * |
| * The block lookup table is stored at the end of the total block structure. |
| * |
| * |
| * The rank structure for DENSE blocks is an array of byte-pairs with an entry for each |
| * sub-block (default 512 bits) out of the 65536 bits in the outer DENSE block. |
| * |
| * Each rank-entry states the number of set bits within the block up to the bit before the |
| * bit positioned at the start of the sub-block. |
| * Note that that the rank entry of the first sub-block is always 0 and that the last entry can |
| * at most be 65536-2 = 65634 and thus will always fit into an byte-pair of 16 bits. |
| * |
| * The rank structure for a given DENSE block is stored at the beginning of the DENSE block. |
| * This ensures locality and keeps logistics simple. |
| * |
| * @lucene.internal |
| */ |
| final class IndexedDISI extends DocIdSetIterator { |
| |
| // jump-table time/space trade-offs to consider: |
| // The block offsets and the block indexes could be stored in more compressed form with |
| // two PackedInts or two MonotonicDirectReaders. |
| // The DENSE ranks (default 128 shorts = 256 bytes) could likewise be compressed. But as there is |
| // at least 4096 set bits in DENSE blocks, there will be at least one rank with 2^12 bits, so it |
| // is doubtful if there is much to gain here. |
| |
| private static final int BLOCK_SIZE = 65536; // The number of docIDs that a single block represents |
| |
| private static final int DENSE_BLOCK_LONGS = BLOCK_SIZE/Long.SIZE; // 1024 |
| public static final byte DEFAULT_DENSE_RANK_POWER = 9; // Every 512 docIDs / 8 longs |
| |
| static final int MAX_ARRAY_LENGTH = (1 << 12) - 1; |
| |
| private static void flush( |
| int block, FixedBitSet buffer, int cardinality, byte denseRankPower, IndexOutput out) throws IOException { |
| assert block >= 0 && block < 65536; |
| out.writeShort((short) block); |
| assert cardinality > 0 && cardinality <= 65536; |
| out.writeShort((short) (cardinality - 1)); |
| if (cardinality > MAX_ARRAY_LENGTH) { |
| if (cardinality != 65536) { // all docs are set |
| if (denseRankPower != -1) { |
| final byte[] rank = createRank(buffer, denseRankPower); |
| out.writeBytes(rank, rank.length); |
| } |
| for (long word : buffer.getBits()) { |
| out.writeLong(word); |
| } |
| } |
| } else { |
| BitSetIterator it = new BitSetIterator(buffer, cardinality); |
| for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { |
| out.writeShort((short) doc); |
| } |
| } |
| } |
| |
| // Creates a DENSE rank-entry (the number of set bits up to a given point) for the buffer. |
| // One rank-entry for every {@code 2^denseRankPower} bits, with each rank-entry using 2 bytes. |
| // Represented as a byte[] for fast flushing and mirroring of the retrieval representation. |
| private static byte[] createRank(FixedBitSet buffer, byte denseRankPower) { |
| final int longsPerRank = 1 << (denseRankPower-6); |
| final int rankMark = longsPerRank-1; |
| final int rankIndexShift = denseRankPower-7; // 6 for the long (2^6) + 1 for 2 bytes/entry |
| final byte[] rank = new byte[DENSE_BLOCK_LONGS >> rankIndexShift]; |
| final long[] bits = buffer.getBits(); |
| int bitCount = 0; |
| for (int word = 0 ; word < DENSE_BLOCK_LONGS ; word++) { |
| if ((word & rankMark) == 0) { // Every longsPerRank longs |
| rank[word >> rankIndexShift] = (byte)(bitCount>>8); |
| rank[(word >> rankIndexShift)+1] = (byte)(bitCount & 0xFF); |
| } |
| bitCount += Long.bitCount(bits[word]); |
| } |
| return rank; |
| } |
| |
| /** |
| * Writes the docIDs from it to out, in logical blocks, one for each 65536 docIDs in monotonically increasing |
| * gap-less order. DENSE blocks uses {@link #DEFAULT_DENSE_RANK_POWER} of 9 (every 512 docIDs / 8 longs). |
| * The caller must keep track of the number of jump-table entries (returned by this method) as well as the |
| * denseRankPower (9 for this method) and provide them when constructing an IndexedDISI for reading. |
| * @param it the document IDs. |
| * @param out destination for the blocks. |
| * @throws IOException if there was an error writing to out. |
| * @return the number of jump-table entries following the blocks, -1 for no entries. |
| * This should be stored in meta and used when creating an instance of IndexedDISI. |
| */ |
| static short writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException { |
| return writeBitSet(it, out, DEFAULT_DENSE_RANK_POWER); |
| } |
| |
| /** |
| * Writes the docIDs from it to out, in logical blocks, one for each 65536 docIDs in monotonically |
| * increasing gap-less order. |
| * The caller must keep track of the number of jump-table entries (returned by this method) as well as the |
| * denseRankPower and provide them when constructing an IndexedDISI for reading. |
| * @param it the document IDs. |
| * @param out destination for the blocks. |
| * @param denseRankPower for {@link Method#DENSE} blocks, a rank will be written every {@code 2^denseRankPower} docIDs. |
| * Values < 7 (every 128 docIDs) or > 15 (every 32768 docIDs) disables DENSE rank. |
| * Recommended values are 8-12: Every 256-4096 docIDs or 4-64 longs. |
| * {@link #DEFAULT_DENSE_RANK_POWER} is 9: Every 512 docIDs. |
| * This should be stored in meta and used when creating an instance of IndexedDISI. |
| * @throws IOException if there was an error writing to out. |
| * @return the number of jump-table entries following the blocks, -1 for no entries. |
| * This should be stored in meta and used when creating an instance of IndexedDISI. |
| */ |
| static short writeBitSet(DocIdSetIterator it, IndexOutput out, byte denseRankPower) throws IOException { |
| final long origo = out.getFilePointer(); // All jumps are relative to the origo |
| if ((denseRankPower < 7 || denseRankPower > 15) && denseRankPower != -1) { |
| throw new IllegalArgumentException("Acceptable values for denseRankPower are 7-15 (every 128-32768 docIDs). " + |
| "The provided power was " + denseRankPower + " (every " + (int)Math.pow(2, denseRankPower) + " docIDs)"); |
| } |
| int totalCardinality = 0; |
| int blockCardinality = 0; |
| final FixedBitSet buffer = new FixedBitSet(1<<16); |
| int[] jumps = new int[ArrayUtil.oversize(1, Integer.BYTES*2)]; |
| int prevBlock = -1; |
| int jumpBlockIndex = 0; |
| |
| for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { |
| final int block = doc >>> 16; |
| if (prevBlock != -1 && block != prevBlock) { |
| // Track offset+index from previous block up to current |
| jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, jumpBlockIndex, prevBlock+1); |
| jumpBlockIndex = prevBlock+1; |
| // Flush block |
| flush(prevBlock, buffer, blockCardinality, denseRankPower, out); |
| // Reset for next block |
| buffer.clear(0, buffer.length()); |
| totalCardinality += blockCardinality; |
| blockCardinality = 0; |
| } |
| buffer.set(doc & 0xFFFF); |
| blockCardinality++; |
| prevBlock = block; |
| } |
| if (blockCardinality > 0) { |
| jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, jumpBlockIndex, prevBlock+1); |
| totalCardinality += blockCardinality; |
| flush(prevBlock, buffer, blockCardinality, denseRankPower, out); |
| buffer.clear(0, buffer.length()); |
| prevBlock++; |
| } |
| final int lastBlock = prevBlock == -1 ? 0 : prevBlock; // There will always be at least 1 block (NO_MORE_DOCS) |
| // Last entry is a SPARSE with blockIndex == 32767 and the single entry 65535, which becomes the docID NO_MORE_DOCS |
| // To avoid creating 65K jump-table entries, only a single entry is created pointing to the offset of the |
| // NO_MORE_DOCS block, with the jumpBlockIndex set to the logical EMPTY block after all real blocks. |
| jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, lastBlock, lastBlock+1); |
| buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF); |
| flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, denseRankPower, out); |
| // offset+index jump-table stored at the end |
| return flushBlockJumps(jumps, lastBlock+1, out, origo); |
| } |
| |
| // Adds entries to the offset & index jump-table for blocks |
| private static int[] addJumps(int[] jumps, long offset, int index, int startBlock, int endBlock) { |
| assert offset < Integer.MAX_VALUE : "Logically the offset should not exceed 2^30 but was >= Integer.MAX_VALUE"; |
| jumps = ArrayUtil.grow(jumps, (endBlock+1)*2); |
| for (int b = startBlock; b < endBlock; b++) { |
| jumps[b*2] = index; |
| jumps[b*2+1] = (int) offset; |
| } |
| return jumps; |
| } |
| |
| // Flushes the offset & index jump-table for blocks. This should be the last data written to out |
| // This method returns the blockCount for the blocks reachable for the jump_table or -1 for no jump-table |
| private static short flushBlockJumps(int[] jumps, int blockCount, IndexOutput out, long origo) throws IOException { |
| if (blockCount == 2) { // Jumps with a single real entry + NO_MORE_DOCS is just wasted space so we ignore that |
| blockCount = 0; |
| } |
| for (int i = 0 ; i < blockCount ; i++) { |
| out.writeInt(jumps[i*2]); // index |
| out.writeInt(jumps[i*2+1]); // offset |
| } |
| // As there are at most 32k blocks, the count is a short |
| // The jumpTableOffset will be at lastPos - (blockCount * Long.BYTES) |
| return (short)blockCount; |
| } |
| |
| // Members are pkg-private to avoid synthetic accessors when accessed from the `Method` enum |
| |
| /** The slice that stores the {@link DocIdSetIterator}. */ |
| final IndexInput slice; |
| final int jumpTableEntryCount; |
| final byte denseRankPower; |
| final RandomAccessInput jumpTable; // Skip blocks of 64K bits |
| final byte[] denseRankTable; |
| final long cost; |
| |
| /** |
| * This constructor always creates a new blockSlice and a new jumpTable from in, to ensure that operations are |
| * independent from the caller. |
| * See {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)} for re-use of blockSlice and jumpTable. |
| * @param in backing data. |
| * @param offset starting offset for blocks in the backing data. |
| * @param length the number of bytes holding blocks and jump-table in the backing data. |
| * @param jumpTableEntryCount the number of blocks covered by the jump-table. |
| * This must match the number returned by {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}. |
| * @param denseRankPower the number of docIDs covered by each rank entry in DENSE blocks, expressed as {@code 2^denseRankPower}. |
| * This must match the power given in {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)} |
| * @param cost normally the number of logical docIDs. |
| */ |
| IndexedDISI(IndexInput in, long offset, long length, int jumpTableEntryCount, byte denseRankPower, long cost) throws IOException { |
| this(createBlockSlice(in,"docs", offset, length, jumpTableEntryCount), |
| createJumpTable(in, offset, length, jumpTableEntryCount), |
| jumpTableEntryCount, denseRankPower, cost); |
| } |
| |
| /** |
| * This constructor allows to pass the slice and jumpTable directly in case it helps reuse. |
| * see eg. Lucene80 norms producer's merge instance. |
| * @param blockSlice data blocks, normally created by {@link #createBlockSlice}. |
| * @param jumpTable table holding jump-data for block-skips, normally created by {@link #createJumpTable}. |
| * @param jumpTableEntryCount the number of blocks covered by the jump-table. |
| * This must match the number returned by {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}. |
| * @param denseRankPower the number of docIDs covered by each rank entry in DENSE blocks, expressed as {@code 2^denseRankPower}. |
| * This must match the power given in {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)} |
| * @param cost normally the number of logical docIDs. |
| */ |
| IndexedDISI(IndexInput blockSlice, RandomAccessInput jumpTable, int jumpTableEntryCount, byte denseRankPower, long cost) throws IOException { |
| if ((denseRankPower < 7 || denseRankPower > 15) && denseRankPower != -1) { |
| throw new IllegalArgumentException("Acceptable values for denseRankPower are 7-15 (every 128-32768 docIDs). " + |
| "The provided power was " + denseRankPower + " (every " + (int)Math.pow(2, denseRankPower) + " docIDs). "); |
| } |
| |
| this.slice = blockSlice; |
| this.jumpTable = jumpTable; |
| this.jumpTableEntryCount = jumpTableEntryCount; |
| this.denseRankPower = denseRankPower; |
| final int rankIndexShift = denseRankPower-7; |
| this.denseRankTable = denseRankPower == -1 ? null : new byte[DENSE_BLOCK_LONGS >> rankIndexShift]; |
| this.cost = cost; |
| } |
| |
| /** |
| * Helper method for using {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)}. |
| * Creates a disiSlice for the IndexedDISI data blocks, without the jump-table. |
| * @param slice backing data, holding both blocks and jump-table. |
| * @param sliceDescription human readable slice designation. |
| * @param offset relative to the backing data. |
| * @param length full length of the IndexedDISI, including blocks and jump-table data. |
| * @param jumpTableEntryCount the number of blocks covered by the jump-table. |
| * @return a jumpTable containing the block jump-data or null if no such table exists. |
| * @throws IOException if a RandomAccessInput could not be created from slice. |
| */ |
| public static IndexInput createBlockSlice( |
| IndexInput slice, String sliceDescription, long offset, long length, int jumpTableEntryCount) throws IOException { |
| long jumpTableBytes = jumpTableEntryCount < 0 ? 0 : jumpTableEntryCount*Integer.BYTES*2; |
| return slice.slice(sliceDescription, offset, length - jumpTableBytes); |
| } |
| |
| /** |
| * Helper method for using {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)}. |
| * Creates a RandomAccessInput covering only the jump-table data or null. |
| * @param slice backing data, holding both blocks and jump-table. |
| * @param offset relative to the backing data. |
| * @param length full length of the IndexedDISI, including blocks and jump-table data. |
| * @param jumpTableEntryCount the number of blocks covered by the jump-table. |
| * @return a jumpTable containing the block jump-data or null if no such table exists. |
| * @throws IOException if a RandomAccessInput could not be created from slice. |
| */ |
| public static RandomAccessInput createJumpTable( |
| IndexInput slice, long offset, long length, int jumpTableEntryCount) throws IOException { |
| if (jumpTableEntryCount <= 0) { |
| return null; |
| } else { |
| int jumpTableBytes = jumpTableEntryCount*Integer.BYTES*2; |
| return slice.randomAccessSlice(offset + length - jumpTableBytes, jumpTableBytes); |
| } |
| } |
| |
| int block = -1; |
| long blockEnd; |
| long denseBitmapOffset = -1; // Only used for DENSE blocks |
| int nextBlockIndex = -1; |
| Method method; |
| |
| int doc = -1; |
| int index = -1; |
| |
| // SPARSE variables |
| boolean exists; |
| |
| // DENSE variables |
| long word; |
| int wordIndex = -1; |
| // number of one bits encountered so far, including those of `word` |
| int numberOfOnes; |
| // Used with rank for jumps inside of DENSE as they are absolute instead of relative |
| int denseOrigoIndex; |
| |
| // ALL variables |
| int gap; |
| |
| @Override |
| public int docID() { |
| return doc; |
| } |
| |
| @Override |
| public int advance(int target) throws IOException { |
| final int targetBlock = target & 0xFFFF0000; |
| if (block < targetBlock) { |
| advanceBlock(targetBlock); |
| } |
| if (block == targetBlock) { |
| if (method.advanceWithinBlock(this, target)) { |
| return doc; |
| } |
| readBlockHeader(); |
| } |
| boolean found = method.advanceWithinBlock(this, block); |
| assert found; |
| return doc; |
| } |
| |
| public boolean advanceExact(int target) throws IOException { |
| final int targetBlock = target & 0xFFFF0000; |
| if (block < targetBlock) { |
| advanceBlock(targetBlock); |
| } |
| boolean found = block == targetBlock && method.advanceExactWithinBlock(this, target); |
| this.doc = target; |
| return found; |
| } |
| |
| private void advanceBlock(int targetBlock) throws IOException { |
| final int blockIndex = targetBlock >> 16; |
| // If the destination block is 2 blocks or more ahead, we use the jump-table. |
| if (jumpTable != null && blockIndex >= (block >> 16)+2) { |
| // If the jumpTableEntryCount is exceeded, there are no further bits. Last entry is always NO_MORE_DOCS |
| final int inRangeBlockIndex = blockIndex < jumpTableEntryCount ? blockIndex : jumpTableEntryCount-1; |
| final int index = jumpTable.readInt(inRangeBlockIndex*Integer.BYTES*2); |
| final int offset = jumpTable.readInt(inRangeBlockIndex*Integer.BYTES*2+Integer.BYTES); |
| this.nextBlockIndex = index-1; // -1 to compensate for the always-added 1 in readBlockHeader |
| slice.seek(offset); |
| readBlockHeader(); |
| return; |
| } |
| |
| // Fallback to iteration of blocks |
| do { |
| slice.seek(blockEnd); |
| readBlockHeader(); |
| } while (block < targetBlock); |
| } |
| |
| private void readBlockHeader() throws IOException { |
| block = Short.toUnsignedInt(slice.readShort()) << 16; |
| assert block >= 0; |
| final int numValues = 1 + Short.toUnsignedInt(slice.readShort()); |
| index = nextBlockIndex; |
| nextBlockIndex = index + numValues; |
| if (numValues <= MAX_ARRAY_LENGTH) { |
| method = Method.SPARSE; |
| blockEnd = slice.getFilePointer() + (numValues << 1); |
| } else if (numValues == 65536) { |
| method = Method.ALL; |
| blockEnd = slice.getFilePointer(); |
| gap = block - index - 1; |
| } else { |
| method = Method.DENSE; |
| denseBitmapOffset = slice.getFilePointer() + (denseRankTable == null ? 0 : denseRankTable.length); |
| blockEnd = denseBitmapOffset + (1 << 13); |
| // Performance consideration: All rank (default 128 * 16 bits) are loaded up front. This should be fast with the |
| // reusable byte[] buffer, but it is still wasted if the DENSE block is iterated in small steps. |
| // If this results in too great a performance regression, a heuristic strategy might work where the rank data |
| // are loaded on first in-block advance, if said advance is > X docIDs. The hope being that a small first |
| // advance means that subsequent advances will be small too. |
| // Another alternative is to maintain an extra slice for DENSE rank, but IndexedDISI is already slice-heavy. |
| if (denseRankPower != -1) { |
| slice.readBytes(denseRankTable, 0, denseRankTable.length); |
| } |
| wordIndex = -1; |
| numberOfOnes = index + 1; |
| denseOrigoIndex = numberOfOnes; |
| } |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| return advance(doc + 1); |
| } |
| |
| public int index() { |
| return index; |
| } |
| |
| @Override |
| public long cost() { |
| return cost; |
| } |
| |
| enum Method { |
| SPARSE { |
| @Override |
| boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException { |
| final int targetInBlock = target & 0xFFFF; |
| // TODO: binary search |
| for (; disi.index < disi.nextBlockIndex;) { |
| int doc = Short.toUnsignedInt(disi.slice.readShort()); |
| disi.index++; |
| if (doc >= targetInBlock) { |
| disi.doc = disi.block | doc; |
| disi.exists = true; |
| return true; |
| } |
| } |
| return false; |
| } |
| @Override |
| boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException { |
| final int targetInBlock = target & 0xFFFF; |
| // TODO: binary search |
| if (target == disi.doc) { |
| return disi.exists; |
| } |
| for (; disi.index < disi.nextBlockIndex;) { |
| int doc = Short.toUnsignedInt(disi.slice.readShort()); |
| disi.index++; |
| if (doc >= targetInBlock) { |
| if (doc != targetInBlock) { |
| disi.index--; |
| disi.slice.seek(disi.slice.getFilePointer() - Short.BYTES); |
| break; |
| } |
| disi.exists = true; |
| return true; |
| } |
| } |
| disi.exists = false; |
| return false; |
| } |
| }, |
| DENSE { |
| @Override |
| boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException { |
| final int targetInBlock = target & 0xFFFF; |
| final int targetWordIndex = targetInBlock >>> 6; |
| |
| // If possible, skip ahead using the rank cache |
| // If the distance between the current position and the target is < rank-longs |
| // there is no sense in using rank |
| if (disi.denseRankPower != -1 && targetWordIndex - disi.wordIndex >= (1 << (disi.denseRankPower-6) )) { |
| rankSkip(disi, targetInBlock); |
| } |
| |
| for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) { |
| disi.word = disi.slice.readLong(); |
| disi.numberOfOnes += Long.bitCount(disi.word); |
| } |
| disi.wordIndex = targetWordIndex; |
| |
| long leftBits = disi.word >>> target; |
| if (leftBits != 0L) { |
| disi.doc = target + Long.numberOfTrailingZeros(leftBits); |
| disi.index = disi.numberOfOnes - Long.bitCount(leftBits); |
| return true; |
| } |
| |
| // There were no set bits at the wanted position. Move forward until one is reached |
| while (++disi.wordIndex < 1024) { |
| // This could use the rank cache to skip empty spaces >= 512 bits, but it seems unrealistic |
| // that such blocks would be DENSE |
| disi.word = disi.slice.readLong(); |
| if (disi.word != 0) { |
| disi.index = disi.numberOfOnes; |
| disi.numberOfOnes += Long.bitCount(disi.word); |
| disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word); |
| return true; |
| } |
| } |
| // No set bits in the block at or after the wanted position. |
| return false; |
| } |
| |
| @Override |
| boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException { |
| final int targetInBlock = target & 0xFFFF; |
| final int targetWordIndex = targetInBlock >>> 6; |
| |
| // If possible, skip ahead using the rank cache |
| // If the distance between the current position and the target is < rank-longs |
| // there is no sense in using rank |
| if (disi.denseRankPower != -1 && targetWordIndex - disi.wordIndex >= (1 << (disi.denseRankPower-6) )) { |
| rankSkip(disi, targetInBlock); |
| } |
| |
| for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) { |
| disi.word = disi.slice.readLong(); |
| disi.numberOfOnes += Long.bitCount(disi.word); |
| } |
| disi.wordIndex = targetWordIndex; |
| |
| long leftBits = disi.word >>> target; |
| disi.index = disi.numberOfOnes - Long.bitCount(leftBits); |
| return (leftBits & 1L) != 0; |
| } |
| |
| |
| }, |
| ALL { |
| @Override |
| boolean advanceWithinBlock(IndexedDISI disi, int target) { |
| disi.doc = target; |
| disi.index = target - disi.gap; |
| return true; |
| } |
| @Override |
| boolean advanceExactWithinBlock(IndexedDISI disi, int target) { |
| disi.index = target - disi.gap; |
| return true; |
| } |
| }; |
| |
| /** Advance to the first doc from the block that is equal to or greater than {@code target}. |
| * Return true if there is such a doc and false otherwise. */ |
| abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException; |
| |
| /** Advance the iterator exactly to the position corresponding to the given {@code target} |
| * and return whether this document exists. */ |
| abstract boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException; |
| } |
| |
| /** |
| * If the distance between the current position and the target is > 8 words, the rank cache will |
| * be used to guarantee a worst-case of 1 rank-lookup and 7 word-read-and-count-bits operations. |
| * Note: This does not guarantee a skip up to target, only up to nearest rank boundary. It is the |
| * responsibility of the caller to iterate further to reach target. |
| * @param disi standard DISI. |
| * @param targetInBlock lower 16 bits of the target |
| * @throws IOException if a DISI seek failed. |
| */ |
| private static void rankSkip(IndexedDISI disi, int targetInBlock) throws IOException { |
| assert disi.denseRankPower >= 0 : disi.denseRankPower; |
| // Resolve the rank as close to targetInBlock as possible (maximum distance is 8 longs) |
| // Note: rankOrigoOffset is tracked on block open, so it is absolute (e.g. don't add origo) |
| final int rankIndex = targetInBlock >> disi.denseRankPower; // Default is 9 (8 longs: 2^3 * 2^6 = 512 docIDs) |
| |
| final int rank = |
| (disi.denseRankTable[rankIndex<<1] & 0xFF) << 8 | |
| (disi.denseRankTable[(rankIndex<<1)+1] & 0xFF); |
| |
| // Position the counting logic just after the rank point |
| final int rankAlignedWordIndex = rankIndex << disi.denseRankPower >> 6; |
| disi.slice.seek(disi.denseBitmapOffset + rankAlignedWordIndex*Long.BYTES); |
| long rankWord = disi.slice.readLong(); |
| int denseNOO = rank + Long.bitCount(rankWord); |
| |
| disi.wordIndex = rankAlignedWordIndex; |
| disi.word = rankWord; |
| disi.numberOfOnes = disi.denseOrigoIndex + denseNOO; |
| } |
| } |