| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util; |
| |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.search.DocIdSet; |
| import org.apache.lucene.search.DocIdSetIterator; |
| |
| /** |
| * {@link DocIdSet} implementation inspired from http://roaringbitmap.org/ |
| * |
| * The space is divided into blocks of 2^16 bits and each block is encoded |
| * independently. In each block, if less than 2^12 bits are set, then |
| * documents are simply stored in a short[]. If more than 2^16-2^12 bits are |
| * set, then the inverse of the set is encoded in a simple short[]. Otherwise |
| * a {@link FixedBitSet} is used. |
| * |
| * @lucene.internal |
| */ |
| public class RoaringDocIdSet extends DocIdSet { |
| |
| // Number of documents in a block |
| private static final int BLOCK_SIZE = 1 << 16; |
| // The maximum length for an array, beyond that point we switch to a bitset |
| private static final int MAX_ARRAY_LENGTH = 1 << 12; |
| private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(RoaringDocIdSet.class); |
| |
| /** A builder of {@link RoaringDocIdSet}s. */ |
| public static class Builder { |
| |
| private final int maxDoc; |
| private final DocIdSet[] sets; |
| |
| private int cardinality; |
| private int lastDocId; |
| private int currentBlock; |
| private int currentBlockCardinality; |
| |
| // We start by filling the buffer and when it's full we copy the content of |
| // the buffer to the FixedBitSet and put further documents in that bitset |
| private final short[] buffer; |
| private FixedBitSet denseBuffer; |
| |
| /** Sole constructor. */ |
| public Builder(int maxDoc) { |
| this.maxDoc = maxDoc; |
| sets = new DocIdSet[(maxDoc + (1 << 16) - 1) >>> 16]; |
| lastDocId = -1; |
| currentBlock = -1; |
| buffer = new short[MAX_ARRAY_LENGTH]; |
| } |
| |
| private void flush() { |
| assert currentBlockCardinality <= BLOCK_SIZE; |
| if (currentBlockCardinality <= MAX_ARRAY_LENGTH) { |
| // Use sparse encoding |
| assert denseBuffer == null; |
| if (currentBlockCardinality > 0) { |
| sets[currentBlock] = new ShortArrayDocIdSet(ArrayUtil.copyOfSubArray(buffer, 0, currentBlockCardinality)); |
| } |
| } else { |
| assert denseBuffer != null; |
| assert denseBuffer.cardinality() == currentBlockCardinality; |
| if (denseBuffer.length() == BLOCK_SIZE && BLOCK_SIZE - currentBlockCardinality < MAX_ARRAY_LENGTH) { |
| // Doc ids are very dense, inverse the encoding |
| final short[] excludedDocs = new short[BLOCK_SIZE - currentBlockCardinality]; |
| denseBuffer.flip(0, denseBuffer.length()); |
| int excludedDoc = -1; |
| for (int i = 0; i < excludedDocs.length; ++i) { |
| excludedDoc = denseBuffer.nextSetBit(excludedDoc + 1); |
| assert excludedDoc != DocIdSetIterator.NO_MORE_DOCS; |
| excludedDocs[i] = (short) excludedDoc; |
| } |
| assert excludedDoc + 1 == denseBuffer.length() || denseBuffer.nextSetBit(excludedDoc + 1) == DocIdSetIterator.NO_MORE_DOCS; |
| sets[currentBlock] = new NotDocIdSet(BLOCK_SIZE, new ShortArrayDocIdSet(excludedDocs)); |
| } else { |
| // Neither sparse nor super dense, use a fixed bit set |
| sets[currentBlock] = new BitDocIdSet(denseBuffer, currentBlockCardinality); |
| } |
| denseBuffer = null; |
| } |
| |
| cardinality += currentBlockCardinality; |
| denseBuffer = null; |
| currentBlockCardinality = 0; |
| } |
| |
| /** |
| * Add a new doc-id to this builder. |
| * NOTE: doc ids must be added in order. |
| */ |
| public Builder add(int docId) { |
| if (docId <= lastDocId) { |
| throw new IllegalArgumentException("Doc ids must be added in-order, got " + docId + " which is <= lastDocID=" + lastDocId); |
| } |
| final int block = docId >>> 16; |
| if (block != currentBlock) { |
| // we went to a different block, let's flush what we buffered and start from fresh |
| flush(); |
| currentBlock = block; |
| } |
| |
| if (currentBlockCardinality < MAX_ARRAY_LENGTH) { |
| buffer[currentBlockCardinality] = (short) docId; |
| } else { |
| if (denseBuffer == null) { |
| // the buffer is full, let's move to a fixed bit set |
| final int numBits = Math.min(1 << 16, maxDoc - (block << 16)); |
| denseBuffer = new FixedBitSet(numBits); |
| for (short doc : buffer) { |
| denseBuffer.set(doc & 0xFFFF); |
| } |
| } |
| denseBuffer.set(docId & 0xFFFF); |
| } |
| |
| lastDocId = docId; |
| currentBlockCardinality += 1; |
| return this; |
| } |
| |
| /** Add the content of the provided {@link DocIdSetIterator}. */ |
| public Builder add(DocIdSetIterator disi) throws IOException { |
| for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) { |
| add(doc); |
| } |
| return this; |
| } |
| |
| /** Build an instance. */ |
| public RoaringDocIdSet build() { |
| flush(); |
| return new RoaringDocIdSet(sets, cardinality); |
| } |
| |
| } |
| |
| /** |
| * {@link DocIdSet} implementation that can store documents up to 2^16-1 in a short[]. |
| */ |
| private static class ShortArrayDocIdSet extends DocIdSet { |
| |
| private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(ShortArrayDocIdSet.class); |
| |
| private final short[] docIDs; |
| |
| private ShortArrayDocIdSet(short[] docIDs) { |
| this.docIDs = docIDs; |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(docIDs); |
| } |
| |
| @Override |
| public DocIdSetIterator iterator() throws IOException { |
| return new DocIdSetIterator() { |
| |
| int i = -1; // this is the index of the current document in the array |
| int doc = -1; |
| |
| private int docId(int i) { |
| return docIDs[i] & 0xFFFF; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| if (++i >= docIDs.length) { |
| return doc = NO_MORE_DOCS; |
| } |
| return doc = docId(i); |
| } |
| |
| @Override |
| public int docID() { |
| return doc; |
| } |
| |
| @Override |
| public long cost() { |
| return docIDs.length; |
| } |
| |
| @Override |
| public int advance(int target) throws IOException { |
| // binary search |
| int lo = i + 1; |
| int hi = docIDs.length - 1; |
| while (lo <= hi) { |
| final int mid = (lo + hi) >>> 1; |
| final int midDoc = docId(mid); |
| if (midDoc < target) { |
| lo = mid + 1; |
| } else { |
| hi = mid - 1; |
| } |
| } |
| if (lo == docIDs.length) { |
| i = docIDs.length; |
| return doc = NO_MORE_DOCS; |
| } else { |
| i = lo; |
| return doc = docId(i); |
| } |
| } |
| }; |
| } |
| |
| } |
| |
| private final DocIdSet[] docIdSets; |
| private final int cardinality; |
| private final long ramBytesUsed; |
| |
| private RoaringDocIdSet(DocIdSet[] docIdSets, int cardinality) { |
| this.docIdSets = docIdSets; |
| long ramBytesUsed = BASE_RAM_BYTES_USED + RamUsageEstimator.shallowSizeOf(docIdSets); |
| for (DocIdSet set : this.docIdSets) { |
| if (set != null) { |
| ramBytesUsed += set.ramBytesUsed(); |
| } |
| } |
| this.ramBytesUsed = ramBytesUsed; |
| this.cardinality = cardinality; |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return ramBytesUsed; |
| } |
| |
| @Override |
| public DocIdSetIterator iterator() throws IOException { |
| if (cardinality == 0) { |
| return null; |
| } |
| return new Iterator(); |
| } |
| |
| private class Iterator extends DocIdSetIterator { |
| |
| int block; |
| DocIdSetIterator sub = null; |
| int doc; |
| |
| Iterator() throws IOException { |
| doc = -1; |
| block = -1; |
| sub = DocIdSetIterator.empty(); |
| } |
| |
| @Override |
| public int docID() { |
| return doc; |
| } |
| |
| @Override |
| public int nextDoc() throws IOException { |
| final int subNext = sub.nextDoc(); |
| if (subNext == NO_MORE_DOCS) { |
| return firstDocFromNextBlock(); |
| } |
| return doc = (block << 16) | subNext; |
| } |
| |
| @Override |
| public int advance(int target) throws IOException { |
| final int targetBlock = target >>> 16; |
| if (targetBlock != block) { |
| block = targetBlock; |
| if (block >= docIdSets.length) { |
| sub = null; |
| return doc = NO_MORE_DOCS; |
| } |
| if (docIdSets[block] == null) { |
| return firstDocFromNextBlock(); |
| } |
| sub = docIdSets[block].iterator(); |
| } |
| final int subNext = sub.advance(target & 0xFFFF); |
| if (subNext == NO_MORE_DOCS) { |
| return firstDocFromNextBlock(); |
| } |
| return doc = (block << 16) | subNext; |
| } |
| |
| private int firstDocFromNextBlock() throws IOException { |
| while (true) { |
| block += 1; |
| if (block >= docIdSets.length) { |
| sub = null; |
| return doc = NO_MORE_DOCS; |
| } else if (docIdSets[block] != null) { |
| sub = docIdSets[block].iterator(); |
| final int subNext = sub.nextDoc(); |
| assert subNext != NO_MORE_DOCS; |
| return doc = (block << 16) | subNext; |
| } |
| } |
| } |
| |
| @Override |
| public long cost() { |
| return cardinality; |
| } |
| |
| } |
| |
| /** Return the exact number of documents that are contained in this set. */ |
| public int cardinality() { |
| return cardinality; |
| } |
| |
| @Override |
| public String toString() { |
| return "RoaringDocIdSet(cardinality=" + cardinality + ")"; |
| } |
| } |