| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util; |
| |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.search.DocIdSetIterator; |
| |
| /** |
| * A bit set that only stores longs that have at least one bit which is set. |
| * The way it works is that the space of bits is divided into blocks of |
| * 4096 bits, which is 64 longs. Then for each block, we have:<ul> |
| * <li>a long[] which stores the non-zero longs for that block</li> |
| * <li>a long so that bit <tt>i</tt> being set means that the <code>i-th</code> |
| * long of the block is non-null, and its offset in the array of longs is |
| * the number of one bits on the right of the <code>i-th</code> bit.</li></ul> |
| * |
| * @lucene.internal |
| */ |
| public class SparseFixedBitSet extends BitSet implements Bits, Accountable { |
| |
| private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SparseFixedBitSet.class); |
| private static final long SINGLE_ELEMENT_ARRAY_BYTES_USED = RamUsageEstimator.sizeOf(new long[1]); |
| private static final int MASK_4096 = (1 << 12) - 1; |
| |
| private static int blockCount(int length) { |
| int blockCount = length >>> 12; |
| if ((blockCount << 12) < length) { |
| ++blockCount; |
| } |
| assert (blockCount << 12) >= length; |
| return blockCount; |
| } |
| |
| final long[] indices; |
| final long[][] bits; |
| final int length; |
| int nonZeroLongCount; |
| long ramBytesUsed; |
| |
| /** Create a {@link SparseFixedBitSet} that can contain bits between |
| * <code>0</code> included and <code>length</code> excluded. */ |
| public SparseFixedBitSet(int length) { |
| if (length < 1) { |
| throw new IllegalArgumentException("length needs to be >= 1"); |
| } |
| this.length = length; |
| final int blockCount = blockCount(length); |
| indices = new long[blockCount]; |
| bits = new long[blockCount][]; |
| ramBytesUsed = BASE_RAM_BYTES_USED |
| + RamUsageEstimator.shallowSizeOf(indices) |
| + RamUsageEstimator.shallowSizeOf(bits); |
| } |
| |
| @Override |
| public int length() { |
| return length; |
| } |
| |
| private boolean consistent(int index) { |
| assert index >= 0 && index < length : "index=" + index + ",length=" + length; |
| return true; |
| } |
| |
| @Override |
| public int cardinality() { |
| int cardinality = 0; |
| for (long[] bitArray : bits) { |
| if (bitArray != null) { |
| for (long bits : bitArray) { |
| cardinality += Long.bitCount(bits); |
| } |
| } |
| } |
| return cardinality; |
| } |
| |
| @Override |
| public int approximateCardinality() { |
| // we are assuming that bits are uniformly set and use the linear counting |
| // algorithm to estimate the number of bits that are set based on the number |
| // of longs that are different from zero |
| final int totalLongs = (length + 63) >>> 6; // total number of longs in the space |
| assert totalLongs >= nonZeroLongCount; |
| final int zeroLongs = totalLongs - nonZeroLongCount; // number of longs that are zeros |
| // No need to guard against division by zero, it will return +Infinity and things will work as expected |
| final long estimate = Math.round(totalLongs * Math.log((double) totalLongs / zeroLongs)); |
| return (int) Math.min(length, estimate); |
| } |
| |
| @Override |
| public boolean get(int i) { |
| assert consistent(i); |
| final int i4096 = i >>> 12; |
| final long index = indices[i4096]; |
| final int i64 = i >>> 6; |
| // first check the index, if the i64-th bit is not set, then i is not set |
| // note: this relies on the fact that shifts are mod 64 in java |
| if ((index & (1L << i64)) == 0) { |
| return false; |
| } |
| |
| // if it is set, then we count the number of bits that are set on the right |
| // of i64, and that gives us the index of the long that stores the bits we |
| // are interested in |
| final long bits = this.bits[i4096][Long.bitCount(index & ((1L << i64) - 1))]; |
| return (bits & (1L << i)) != 0; |
| } |
| |
| private static int oversize(int s) { |
| int newSize = s + (s >>> 1); |
| if (newSize > 50) { |
| newSize = 64; |
| } |
| return newSize; |
| } |
| |
| /** |
| * Set the bit at index <tt>i</tt>. |
| */ |
| public void set(int i) { |
| assert consistent(i); |
| final int i4096 = i >>> 12; |
| final long index = indices[i4096]; |
| final int i64 = i >>> 6; |
| if ((index & (1L << i64)) != 0) { |
| // in that case the sub 64-bits block we are interested in already exists, |
| // we just need to set a bit in an existing long: the number of ones on |
| // the right of i64 gives us the index of the long we need to update |
| bits[i4096][Long.bitCount(index & ((1L << i64) - 1))] |= 1L << i; // shifts are mod 64 in java |
| } else if (index == 0) { |
| // if the index is 0, it means that we just found a block of 4096 bits |
| // that has no bit that is set yet. So let's initialize a new block: |
| insertBlock(i4096, i64, i); |
| } else { |
| // in that case we found a block of 4096 bits that has some values, but |
| // the sub-block of 64 bits that we are interested in has no value yet, |
| // so we need to insert a new long |
| insertLong(i4096, i64, i, index); |
| } |
| } |
| |
| private void insertBlock(int i4096, int i64, int i) { |
| indices[i4096] = 1L << i64; // shifts are mod 64 in java |
| assert bits[i4096] == null; |
| bits[i4096] = new long[] { 1L << i }; // shifts are mod 64 in java |
| ++nonZeroLongCount; |
| ramBytesUsed += SINGLE_ELEMENT_ARRAY_BYTES_USED; |
| } |
| |
| private void insertLong(int i4096, int i64, int i, long index) { |
| indices[i4096] |= 1L << i64; // shifts are mod 64 in java |
| // we count the number of bits that are set on the right of i64 |
| // this gives us the index at which to perform the insertion |
| final int o = Long.bitCount(index & ((1L << i64) - 1)); |
| final long[] bitArray = bits[i4096]; |
| if (bitArray[bitArray.length - 1] == 0) { |
| // since we only store non-zero longs, if the last value is 0, it means |
| // that we alreay have extra space, make use of it |
| System.arraycopy(bitArray, o, bitArray, o + 1, bitArray.length - o - 1); |
| bitArray[o] = 1L << i; |
| } else { |
| // we don't have extra space so we need to resize to insert the new long |
| final int newSize = oversize(bitArray.length + 1); |
| final long[] newBitArray = new long[newSize]; |
| System.arraycopy(bitArray, 0, newBitArray, 0, o); |
| newBitArray[o] = 1L << i; |
| System.arraycopy(bitArray, o, newBitArray, o + 1, bitArray.length - o); |
| bits[i4096] = newBitArray; |
| ramBytesUsed += RamUsageEstimator.sizeOf(newBitArray) - RamUsageEstimator.sizeOf(bitArray); |
| } |
| ++nonZeroLongCount; |
| } |
| |
| /** |
| * Clear the bit at index <tt>i</tt>. |
| */ |
| public void clear(int i) { |
| assert consistent(i); |
| final int i4096 = i >>> 12; |
| final int i64 = i >>> 6; |
| and(i4096, i64, ~(1L << i)); |
| } |
| |
| private void and(int i4096, int i64, long mask) { |
| final long index = indices[i4096]; |
| if ((index & (1L << i64)) != 0) { |
| // offset of the long bits we are interested in in the array |
| final int o = Long.bitCount(index & ((1L << i64) - 1)); |
| long bits = this.bits[i4096][o] & mask; |
| if (bits == 0) { |
| removeLong(i4096, i64, index, o); |
| } else { |
| this.bits[i4096][o] = bits; |
| } |
| } |
| } |
| |
| private void removeLong(int i4096, int i64, long index, int o) { |
| index &= ~(1L << i64); |
| indices[i4096] = index; |
| if (index == 0) { |
| // release memory, there is nothing in this block anymore |
| this.bits[i4096] = null; |
| } else { |
| final int length = Long.bitCount(index); |
| final long[] bitArray = bits[i4096]; |
| System.arraycopy(bitArray, o + 1, bitArray, o, length - o); |
| bitArray[length] = 0L; |
| } |
| nonZeroLongCount -= 1; |
| } |
| |
| @Override |
| public void clear(int from, int to) { |
| assert from >= 0; |
| assert to <= length; |
| if (from >= to) { |
| return; |
| } |
| final int firstBlock = from >>> 12; |
| final int lastBlock = (to - 1) >>> 12; |
| if (firstBlock == lastBlock) { |
| clearWithinBlock(firstBlock, from & MASK_4096, (to - 1) & MASK_4096); |
| } else { |
| clearWithinBlock(firstBlock, from & MASK_4096, MASK_4096); |
| for (int i = firstBlock + 1; i < lastBlock; ++i) { |
| nonZeroLongCount -= Long.bitCount(indices[i]); |
| indices[i] = 0; |
| bits[i] = null; |
| } |
| clearWithinBlock(lastBlock, 0, (to - 1) & MASK_4096); |
| } |
| } |
| |
| // create a long that has bits set to one between from and to |
| private static long mask(int from, int to) { |
| return ((1L << (to - from) << 1) - 1) << from; |
| } |
| |
| private void clearWithinBlock(int i4096, int from, int to) { |
| int firstLong = from >>> 6; |
| int lastLong = to >>> 6; |
| |
| if (firstLong == lastLong) { |
| and(i4096, firstLong, ~mask(from, to)); |
| } else { |
| assert firstLong < lastLong; |
| and(i4096, lastLong, ~mask(0, to)); |
| for (int i = lastLong - 1; i >= firstLong + 1; --i) { |
| and(i4096, i, 0L); |
| } |
| and(i4096, firstLong, ~mask(from, 63)); |
| } |
| } |
| |
| /** Return the first document that occurs on or after the provided block index. */ |
| private int firstDoc(int i4096) { |
| long index = 0; |
| while (i4096 < indices.length) { |
| index = indices[i4096]; |
| if (index != 0) { |
| final int i64 = Long.numberOfTrailingZeros(index); |
| return (i4096 << 12) | (i64 << 6) | Long.numberOfTrailingZeros(bits[i4096][0]); |
| } |
| i4096 += 1; |
| } |
| return DocIdSetIterator.NO_MORE_DOCS; |
| } |
| |
| @Override |
| public int nextSetBit(int i) { |
| assert i < length; |
| final int i4096 = i >>> 12; |
| final long index = indices[i4096]; |
| final long[] bitArray = this.bits[i4096]; |
| int i64 = i >>> 6; |
| int o = Long.bitCount(index & ((1L << i64) - 1)); |
| if ((index & (1L << i64)) != 0) { |
| // There is at least one bit that is set in the current long, check if |
| // one of them is after i |
| final long bits = bitArray[o] >>> i; // shifts are mod 64 |
| if (bits != 0) { |
| return i + Long.numberOfTrailingZeros(bits); |
| } |
| o += 1; |
| } |
| final long indexBits = index >>> i64 >>> 1; |
| if (indexBits == 0) { |
| // no more bits are set in the current block of 4096 bits, go to the next one |
| return firstDoc(i4096 + 1); |
| } |
| // there are still set bits |
| i64 += 1 + Long.numberOfTrailingZeros(indexBits); |
| final long bits = bitArray[o]; |
| return (i64 << 6) | Long.numberOfTrailingZeros(bits); |
| } |
| |
| /** Return the last document that occurs on or before the provided block index. */ |
| private int lastDoc(int i4096) { |
| long index; |
| while (i4096 >= 0) { |
| index = indices[i4096]; |
| if (index != 0) { |
| final int i64 = 63 - Long.numberOfLeadingZeros(index); |
| final long bits = this.bits[i4096][Long.bitCount(index) - 1]; |
| return (i4096 << 12) | (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits)); |
| } |
| i4096 -= 1; |
| } |
| return -1; |
| } |
| |
| @Override |
| public int prevSetBit(int i) { |
| assert i >= 0; |
| final int i4096 = i >>> 12; |
| final long index = indices[i4096]; |
| final long[] bitArray = this.bits[i4096]; |
| int i64 = i >>> 6; |
| final long indexBits = index & ((1L << i64) - 1); |
| final int o = Long.bitCount(indexBits); |
| if ((index & (1L << i64)) != 0) { |
| // There is at least one bit that is set in the same long, check if there |
| // is one bit that is set that is lower than i |
| final long bits = bitArray[o] & ((1L << i << 1) - 1); |
| if (bits != 0) { |
| return (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits)); |
| } |
| } |
| if (indexBits == 0) { |
| // no more bits are set in this block, go find the last bit in the |
| // previous block |
| return lastDoc(i4096 - 1); |
| } |
| // go to the previous long |
| i64 = 63 - Long.numberOfLeadingZeros(indexBits); |
| final long bits = bitArray[o - 1]; |
| return (i4096 << 12) | (i64 << 6) | (63 - Long.numberOfLeadingZeros(bits)); |
| } |
| |
| /** Return the long bits at the given <code>i64</code> index. */ |
| private long longBits(long index, long[] bits, int i64) { |
| if ((index & (1L << i64)) == 0) { |
| return 0L; |
| } else { |
| return bits[Long.bitCount(index & ((1L << i64) - 1))]; |
| } |
| } |
| |
| private void or(final int i4096, final long index, long[] bits, int nonZeroLongCount) { |
| assert Long.bitCount(index) == nonZeroLongCount; |
| final long currentIndex = indices[i4096]; |
| if (currentIndex == 0) { |
| // fast path: if we currently have nothing in the block, just copy the data |
| // this especially happens all the time if you call OR on an empty set |
| indices[i4096] = index; |
| this.bits[i4096] = ArrayUtil.copyOfSubArray(bits, 0, nonZeroLongCount); |
| this.nonZeroLongCount += nonZeroLongCount; |
| return; |
| } |
| final long[] currentBits = this.bits[i4096]; |
| final long[] newBits; |
| final long newIndex = currentIndex | index; |
| final int requiredCapacity = Long.bitCount(newIndex); |
| if (currentBits.length >= requiredCapacity) { |
| newBits = currentBits; |
| } else { |
| newBits = new long[oversize(requiredCapacity)]; |
| } |
| // we iterate backwards in order to not override data we might need on the next iteration if the |
| // array is reused |
| for (int i = Long.numberOfLeadingZeros(newIndex), newO = Long.bitCount(newIndex) - 1; |
| i < 64; |
| i += 1 + Long.numberOfLeadingZeros(newIndex << (i + 1)), newO -= 1) { |
| // bitIndex is the index of a bit which is set in newIndex and newO is the number of 1 bits on its right |
| final int bitIndex = 63 - i; |
| assert newO == Long.bitCount(newIndex & ((1L << bitIndex) - 1)); |
| newBits[newO] = longBits(currentIndex, currentBits, bitIndex) | longBits(index, bits, bitIndex); |
| } |
| indices[i4096] = newIndex; |
| this.bits[i4096] = newBits; |
| this.nonZeroLongCount += nonZeroLongCount - Long.bitCount(currentIndex & index); |
| } |
| |
| private void or(SparseFixedBitSet other) { |
| for (int i = 0; i < other.indices.length; ++i) { |
| final long index = other.indices[i]; |
| if (index != 0) { |
| or(i, index, other.bits[i], Long.bitCount(index)); |
| } |
| } |
| } |
| |
| /** |
| * {@link #or(DocIdSetIterator)} impl that works best when <code>it</code> is dense |
| */ |
| private void orDense(DocIdSetIterator it) throws IOException { |
| checkUnpositioned(it); |
| // The goal here is to try to take advantage of the ordering of documents |
| // to build the data-structure more efficiently |
| // NOTE: this heavily relies on the fact that shifts are mod 64 |
| final int firstDoc = it.nextDoc(); |
| if (firstDoc == DocIdSetIterator.NO_MORE_DOCS) { |
| return; |
| } |
| int i4096 = firstDoc >>> 12; |
| int i64 = firstDoc >>> 6; |
| long index = 1L << i64; |
| long currentLong = 1L << firstDoc; |
| // we store at most 64 longs per block so preallocate in order never to have to resize |
| long[] longs = new long[64]; |
| int numLongs = 0; |
| |
| for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { |
| final int doc64 = doc >>> 6; |
| if (doc64 == i64) { |
| // still in the same long, just set the bit |
| currentLong |= 1L << doc; |
| } else { |
| longs[numLongs++] = currentLong; |
| |
| final int doc4096 = doc >>> 12; |
| if (doc4096 == i4096) { |
| index |= 1L << doc64; |
| } else { |
| // we are on a new block, flush what we buffered |
| or(i4096, index, longs, numLongs); |
| // and reset state for the new block |
| i4096 = doc4096; |
| index = 1L << doc64; |
| numLongs = 0; |
| } |
| |
| // we are on a new long, reset state |
| i64 = doc64; |
| currentLong = 1L << doc; |
| } |
| } |
| |
| // flush |
| longs[numLongs++] = currentLong; |
| or(i4096, index, longs, numLongs); |
| } |
| |
| @Override |
| public void or(DocIdSetIterator it) throws IOException { |
| { |
| // specialize union with another SparseFixedBitSet |
| final SparseFixedBitSet other = BitSetIterator.getSparseFixedBitSetOrNull(it); |
| if (other != null) { |
| checkUnpositioned(it); |
| or(other); |
| return; |
| } |
| } |
| |
| // We do not specialize the union with a FixedBitSet since FixedBitSets are |
| // supposed to be used for dense data and sparse fixed bit sets for sparse |
| // data, so a sparse set would likely get upgraded by DocIdSetBuilder before |
| // being or'ed with a FixedBitSet |
| |
| if (it.cost() < indices.length) { |
| // the default impl is good for sparse iterators |
| super.or(it); |
| } else { |
| orDense(it); |
| } |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return ramBytesUsed; |
| } |
| |
| @Override |
| public String toString() { |
| return "SparseFixedBitSet(size=" + length + ",cardinality=~" + approximateCardinality(); |
| } |
| } |