/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.solr.search;

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LSBRadixSorter;
import org.apache.lucene.util.packed.PackedInts;

/**
* Adapted from Lucene's {@code DocIdSetBuilder} to build Solr {@link DocSet} instances.
*
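* <p>A minimal usage sketch (illustrative only; {@code maxDoc}, {@code costEstimate} and
* {@code termsEnum} are assumed to be supplied by the caller):
*
* <pre>
*   DocSetBuilder builder = new DocSetBuilder(maxDoc, costEstimate);
*   builder.add(termsEnum, 0);         // union the postings of every term in the enum
*   DocSet docs = builder.build(null); // pass a FixedBitSet instead of null to intersect with a filter
* </pre>
*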
* @lucene.internal
*/
public final class DocSetBuilder {
private final int maxDoc;
private final int threshold;  // buffer size (in docs) at which we switch to a FixedBitSet
private int[] buffer;         // doc id buffer, sorted lazily when build() is called
private int pos;              // number of valid entries in buffer
private FixedBitSet bitSet;   // non-null once we have upgraded from the buffer
public DocSetBuilder(int maxDoc, long costEst) {
this.maxDoc = maxDoc;
// For ridiculously small sets, we'll just use a sorted int[]
// maxDoc >>> 7 is a good value if you want to save memory, lower values
// such as maxDoc >>> 11 should provide faster building but at the expense
// of using a full bitset even for quite sparse data
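// e.g. with maxDoc = 1,000,000: threshold = (1,000,000 >>> 7) + 4 = 7816 ints (~31KB) vs. a ~122KB FixedBitSet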
this.threshold = (maxDoc >>> 7) + 4; // the +4 is for better testing on small indexes
if (costEst > threshold) {
bitSet = new FixedBitSet(maxDoc);
} else {
this.buffer = new int[Math.max((int)costEst,1)];
}
}
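
/** Copies the buffered doc ids into a freshly allocated FixedBitSet and discards the buffer. */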
private void upgradeToBitSet() {
assert bitSet == null;
bitSet = new FixedBitSet(maxDoc);
for (int i = 0; i < pos; ++i) {
bitSet.set(buffer[i]);
}
this.buffer = null;
this.pos = 0;
}
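
/** Grows the buffer to hold at least {@code minSize} entries, doubling its size but never growing past {@code threshold}. */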
private void growBuffer(int minSize) {
if (minSize < buffer.length) return;
int newSize = buffer.length;
while (newSize < minSize) {
newSize = newSize << 1;
}
newSize = Math.min(newSize, threshold);
int[] newBuffer = new int[newSize];
System.arraycopy(buffer, 0, newBuffer, 0, pos);
buffer = newBuffer;
}
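
/** Adds every document from {@code iter}, offset by {@code base}; upgrades to a bit set once the buffer would reach the threshold. */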
public void add(DocIdSetIterator iter, int base) throws IOException {
grow((int) Math.min(Integer.MAX_VALUE, iter.cost()));
if (bitSet != null) {
add(bitSet, iter, base);
} else {
while (true) {
for (int i = pos; i < buffer.length; ++i) {
final int doc = iter.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
pos = i; // update pos
return;
}
buffer[i] = doc + base; // using the loop counter may help with removal of bounds checking
}
pos = buffer.length; // update pos
if (pos + 1 >= threshold) {
break;
}
growBuffer(pos + 1);
}
upgradeToBitSet();
add(bitSet, iter, base);
}
}
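
/** Sets, in {@code bitSet}, every document returned by {@code iter}, each offset by {@code base}. */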
public static void add(FixedBitSet bitSet, DocIdSetIterator iter, int base) throws IOException {
for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
bitSet.set(doc + base);
}
}

/** Adds the documents of every remaining term in {@code te}; returns the number of terms visited. */
public int add(TermsEnum te, int base) throws IOException {
PostingsEnum postings = null;
int termCount = 0;
for(;;) {
BytesRef term = te.next();
if (term == null) break;
termCount++;
postings = te.postings(postings, PostingsEnum.NONE);
add(postings, base);
}
return termCount;
}
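
/** Reserves room for {@code numDocs} additional documents, upgrading to a bit set if the buffer would reach the threshold. */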
public void grow(int numDocs) {
if (bitSet == null) {
final long newLength = (long) pos + numDocs; // widen before adding so a huge numDocs cannot overflow
if (newLength < threshold) {
growBuffer((int) newLength);
} else {
upgradeToBitSet();
}
}
}
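
/** Adds a single document id to the set under construction. */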
public void add(int doc) {
if (bitSet != null) {
bitSet.set(doc);
} else {
if (pos >= buffer.length) {
if (pos + 1 >= threshold) {
upgradeToBitSet();
bitSet.set(doc);
return;
}
growBuffer(pos + 1);
}
buffer[pos++] = doc;
}
}
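
/** Removes duplicates from the sorted {@code arr} and drops documents not set in {@code acceptDocs} (if non-null); returns the new length. */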
private static int dedup(int[] arr, int length, FixedBitSet acceptDocs) {
int pos = 0;
int previous = -1;
for (int i = 0; i < length; ++i) {
final int value = arr[i];
// assert value >= previous;
if (value != previous && (acceptDocs == null || acceptDocs.get(value))) {
arr[pos++] = value;
previous = value;
}
}
return pos;
}
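
/** Builds a DocSet from the collected documents, intersecting with {@code filter} if it is non-null; the internal buffer or bit set is handed off to the returned set. */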
public DocSet build(FixedBitSet filter) {
if (bitSet != null) {
if (filter != null) {
bitSet.and(filter);
}
return new BitDocSet(bitSet);
// TODO - if this set will be cached, should we make it smaller if it's below DocSetUtil.smallSetSize?
} else {
LSBRadixSorter sorter = new LSBRadixSorter();
sorter.sort(PackedInts.bitsRequired(maxDoc - 1), buffer, pos);
final int l = dedup(buffer, pos, filter);
assert l <= pos;
return new SortedIntDocSet(buffer, l); // TODO: have option to not shrink in the future if it will be a temporary set
}
}

/** Only use this if you know there were no duplicates and that docs were collected in-order! */
public DocSet buildUniqueInOrder(FixedBitSet filter) {
if (bitSet != null) {
if (filter != null) {
bitSet.and(filter);
}
return new BitDocSet(bitSet);
} else {
// don't need to sort, but still need to remove non accepted docs
int l = pos;
if (filter != null) {
l = dedup(buffer, pos, filter);
}
return new SortedIntDocSet(buffer, l); // TODO: have option to not shrink in the future if it will be a temporary set
}
}
}