blob: e57292b08c9aa08c83f127154c79d3066b9b6af4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.packed.PackedInts;
/**
* A builder of {@link DocIdSet}s. At first it uses a sparse structure to gather
* documents, and then upgrades to a non-sparse bit set once enough hits match.
*
* To add documents, you first need to call {@link #grow} in order to reserve
* space, and then call {@link BulkAdder#add(int)} on the returned
* {@link BulkAdder}.
*
* @lucene.internal
*/
public final class DocIdSetBuilder {

  /** Utility class to efficiently add many docs in one go.
   * @see DocIdSetBuilder#grow */
  public static abstract class BulkAdder {
    public abstract void add(int doc);
  }

  /** Adder used once the builder has upgraded to the dense representation:
   * each added doc simply sets its bit in the {@link FixedBitSet}. */
  private static class FixedBitSetAdder extends BulkAdder {
    final FixedBitSet bitSet;

    FixedBitSetAdder(FixedBitSet bitSet) {
      this.bitSet = bitSet;
    }

    @Override
    public void add(int doc) {
      bitSet.set(doc);
    }
  }

  /** A chunk of doc IDs: {@code array} holds {@code length} valid entries. */
  private static class Buffer {

    int[] array;
    int length;

    // Allocate an empty buffer with the given capacity.
    Buffer(int length) {
      this.array = new int[length];
      this.length = 0;
    }

    // Wrap an existing array that already contains `length` valid docs.
    Buffer(int[] array, int length) {
      this.array = array;
      this.length = length;
    }
  }

  /** Adder used while the builder is still sparse: docs are appended to the
   * most recently allocated {@link Buffer}. Space must have been reserved
   * beforehand via {@link DocIdSetBuilder#grow}, so no bounds check is done here. */
  private static class BufferAdder extends BulkAdder {
    final Buffer buffer;

    BufferAdder(Buffer buffer) {
      this.buffer = buffer;
    }

    @Override
    public void add(int doc) {
      buffer.array[buffer.length++] = doc;
    }
  }

  private final int maxDoc;
  // number of buffered docs above which we upgrade to a FixedBitSet
  private final int threshold;
  // pkg-private for testing
  final boolean multivalued;
  final double numValuesPerDoc;

  // sparse representation; set to null once upgraded to the bit set or built
  private List<Buffer> buffers = new ArrayList<>();
  private int totalAllocated; // accumulated size of the allocated buffers
  // dense representation; non-null only after upgradeToBitSet()
  private FixedBitSet bitSet;
  // number of values accumulated; only tracked once we have upgraded to the
  // bit set (-1 beforehand, since buffer lengths already carry that info)
  private long counter = -1;
  // the adder returned by grow(), switched when the representation changes
  private BulkAdder adder;

  /**
   * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
   */
  public DocIdSetBuilder(int maxDoc) {
    this(maxDoc, -1, -1);
  }

  /** Create a {@link DocIdSetBuilder} instance that is optimized for
   * accumulating docs that match the given {@link Terms}. */
  public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
    this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
  }

  /** Create a {@link DocIdSetBuilder} instance that is optimized for
   * accumulating docs that match the given {@link PointValues}. */
  public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException {
    this(maxDoc, values.getDocCount(), values.size());
  }

  /**
   * Expert constructor.
   * @param maxDoc     one more than the largest addable doc ID
   * @param docCount   number of distinct docs expected, or a negative value if unknown
   * @param valueCount total number of values expected, or a negative value if unknown;
   *                   together with {@code docCount} this drives the cost estimate
   *                   reported by the {@link DocIdSet} returned from {@link #build()}
   */
  DocIdSetBuilder(int maxDoc, int docCount, long valueCount) {
    this.maxDoc = maxDoc;
    // if stats are unknown (negative) we conservatively assume docs may be
    // multi-valued, which enables the dedup pass in build()
    this.multivalued = docCount < 0 || docCount != valueCount;
    if (docCount <= 0 || valueCount < 0) {
      // assume one value per doc, this means the cost will be overestimated
      // if the docs are actually multi-valued
      this.numValuesPerDoc = 1;
    } else {
      // otherwise compute from index stats
      this.numValuesPerDoc = (double) valueCount / docCount;
    }
    assert numValuesPerDoc >= 1: "valueCount=" + valueCount + " docCount=" + docCount;
    // For ridiculously small sets, we'll just use a sorted int[]
    // maxDoc >>> 7 is a good value if you want to save memory, lower values
    // such as maxDoc >>> 11 should provide faster building but at the expense
    // of using a full bitset even for quite sparse data
    this.threshold = maxDoc >>> 7;
    this.bitSet = null;
  }

  /**
   * Add the content of the provided {@link DocIdSetIterator} to this builder.
   * NOTE: if you need to build a {@link DocIdSet} out of a single
   * {@link DocIdSetIterator}, you should rather use {@link RoaringDocIdSet.Builder}.
   */
  public void add(DocIdSetIterator iter) throws IOException {
    if (bitSet != null) {
      // already dense: let the bit set consume the iterator directly
      bitSet.or(iter);
      return;
    }
    // reserve the iterator's advertised cost in one go, then add up to that
    // many docs through the returned adder
    int cost = (int) Math.min(Integer.MAX_VALUE, iter.cost());
    BulkAdder adder = grow(cost);
    for (int i = 0; i < cost; ++i) {
      int doc = iter.nextDoc();
      if (doc == DocIdSetIterator.NO_MORE_DOCS) {
        return;
      }
      adder.add(doc);
    }
    // cost() may underestimate: keep going one doc at a time, reserving as we go
    for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
      grow(1).add(doc);
    }
  }

  /**
   * Reserve space and return a {@link BulkAdder} object that can be used to
   * add up to {@code numDocs} documents.
   */
  public BulkAdder grow(int numDocs) {
    if (bitSet == null) {
      // the long cast guards against int overflow of totalAllocated + numDocs
      if ((long) totalAllocated + numDocs <= threshold) {
        ensureBufferCapacity(numDocs);
      } else {
        // too many docs for the sparse representation: switch to a bit set,
        // from which point `counter` tracks the number of added values
        upgradeToBitSet();
        counter += numDocs;
      }
    } else {
      counter += numDocs;
    }
    return adder;
  }

  /** Make sure the current buffer (or a new one) can hold {@code numDocs} more docs. */
  private void ensureBufferCapacity(int numDocs) {
    if (buffers.isEmpty()) {
      addBuffer(additionalCapacity(numDocs));
      return;
    }

    Buffer current = buffers.get(buffers.size() - 1);
    if (current.array.length - current.length >= numDocs) {
      // current buffer is large enough
      return;
    }
    if (current.length < current.array.length - (current.array.length >>> 3)) {
      // current buffer is less than 7/8 full, resize rather than waste space
      growBuffer(current, additionalCapacity(numDocs));
    } else {
      addBuffer(additionalCapacity(numDocs));
    }
  }

  /** Compute how much extra capacity to allocate for {@code numDocs} more docs. */
  private int additionalCapacity(int numDocs) {
    // exponential growth: the new array has a size equal to the sum of what
    // has been allocated so far
    int c = totalAllocated;
    // but is also >= numDocs + 1 so that we can store the next batch of docs
    // (plus an empty slot so that we are more likely to reuse the array in build())
    c = Math.max(numDocs + 1, c);
    // avoid cold starts
    c = Math.max(32, c);
    // do not go beyond the threshold
    c = Math.min(threshold - totalAllocated, c);
    return c;
  }

  /** Allocate a fresh buffer of the given capacity and make it the target of the adder. */
  private Buffer addBuffer(int len) {
    Buffer buffer = new Buffer(len);
    buffers.add(buffer);
    adder = new BufferAdder(buffer);
    totalAllocated += buffer.array.length;
    return buffer;
  }

  /** Grow an existing buffer in place by {@code additionalCapacity} slots. */
  private void growBuffer(Buffer buffer, int additionalCapacity) {
    buffer.array = ArrayUtil.growExact(buffer.array, buffer.array.length + additionalCapacity);
    totalAllocated += additionalCapacity;
  }

  /** Switch to the dense representation: replay all buffered docs into a
   * {@link FixedBitSet}, start tracking {@code counter}, and release the buffers. */
  private void upgradeToBitSet() {
    assert bitSet == null;
    FixedBitSet bitSet = new FixedBitSet(maxDoc);
    long counter = 0;
    for (Buffer buffer : buffers) {
      int[] array = buffer.array;
      int length = buffer.length;
      counter += length;
      for (int i = 0; i < length; ++i) {
        bitSet.set(array[i]);
      }
    }
    this.bitSet = bitSet;
    this.counter = counter;
    this.buffers = null;
    this.adder = new FixedBitSetAdder(bitSet);
  }

  /**
   * Build a {@link DocIdSet} from the accumulated doc IDs.
   * This releases all internal state, so the builder cannot be reused afterwards.
   */
  public DocIdSet build() {
    try {
      if (bitSet != null) {
        assert counter >= 0;
        // the bit set cannot report how many distinct docs it holds cheaply,
        // so estimate it from the value count and the values-per-doc ratio
        final long cost = Math.round(counter / numValuesPerDoc);
        return new BitDocIdSet(bitSet, cost);
      } else {
        Buffer concatenated = concat(buffers);
        // LSB radix sort: only as many passes as needed for bitsRequired(maxDoc - 1)
        LSBRadixSorter sorter = new LSBRadixSorter();
        sorter.sort(PackedInts.bitsRequired(maxDoc - 1), concatenated.array, concatenated.length);
        final int l;
        if (multivalued) {
          l = dedup(concatenated.array, concatenated.length);
        } else {
          assert noDups(concatenated.array, concatenated.length);
          l = concatenated.length;
        }
        assert l <= concatenated.length;
        // terminate with a sentinel; concat() guarantees at least one free slot
        concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS;
        return new IntArrayDocIdSet(concatenated.array, l);
      }
    } finally {
      // free memory and make any further use of this builder fail fast
      this.buffers = null;
      this.bitSet = null;
    }
  }

  /**
   * Concatenate the buffers in any order, leaving at least one empty slot in
   * the end
   * NOTE: this method might reuse one of the arrays
   */
  private static Buffer concat(List<Buffer> buffers) {
    int totalLength = 0;
    Buffer largestBuffer = null;
    for (Buffer buffer : buffers) {
      totalLength += buffer.length;
      if (largestBuffer == null || buffer.array.length > largestBuffer.array.length) {
        largestBuffer = buffer;
      }
    }
    if (largestBuffer == null) {
      // no buffers were ever allocated: return a 1-slot buffer so that build()
      // can still store the NO_MORE_DOCS sentinel
      return new Buffer(1);
    }
    // reuse the largest array as the destination when it is big enough
    int[] docs = largestBuffer.array;
    if (docs.length < totalLength + 1) {
      docs = ArrayUtil.growExact(docs, totalLength + 1);
    }
    // the largest buffer's docs are already in place; append all the others
    totalLength = largestBuffer.length;
    for (Buffer buffer : buffers) {
      if (buffer != largestBuffer) {
        System.arraycopy(buffer.array, 0, docs, totalLength, buffer.length);
        totalLength += buffer.length;
      }
    }
    return new Buffer(docs, totalLength);
  }

  /** Remove duplicates from the first {@code length} entries of the sorted
   * array {@code arr} in place, and return the deduplicated length. */
  private static int dedup(int[] arr, int length) {
    if (length == 0) {
      return 0;
    }
    int l = 1;
    int previous = arr[0];
    for (int i = 1; i < length; ++i) {
      final int value = arr[i];
      assert value >= previous;
      if (value != previous) {
        arr[l++] = value;
        previous = value;
      }
    }
    return l;
  }

  /** Assertion helper: verify the first {@code len} entries of {@code a} are
   * strictly increasing (sorted with no duplicates). Always returns true. */
  private static boolean noDups(int[] a, int len) {
    for (int i = 1; i < len; ++i) {
      assert a[i-1] < a[i];
    }
    return true;
  }
}