| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.search; |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.LSBRadixSorter; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| /** |
| * Adapted from DocIdSetBuilder to build DocSets |
| * |
| * @lucene.internal |
| */ |
| public final class DocSetBuilder { |
| |
| private final int maxDoc; |
| private final int threshold; |
| |
| private int[] buffer; |
| private int pos; |
| |
| private FixedBitSet bitSet; |
| |
| |
| public DocSetBuilder(int maxDoc, long costEst) { |
| this.maxDoc = maxDoc; |
| // For ridiculously small sets, we'll just use a sorted int[] |
| // maxDoc >>> 7 is a good value if you want to save memory, lower values |
| // such as maxDoc >>> 11 should provide faster building but at the expense |
| // of using a full bitset even for quite sparse data |
| this.threshold = (maxDoc >>> 7) + 4; // the +4 is for better testing on small indexes |
| |
| if (costEst > threshold) { |
| bitSet = new FixedBitSet(maxDoc); |
| } else { |
| this.buffer = new int[Math.max((int)costEst,1)]; |
| } |
| } |
| |
| private void upgradeToBitSet() { |
| assert bitSet == null; |
| bitSet = new FixedBitSet(maxDoc); |
| for (int i = 0; i < pos; ++i) { |
| bitSet.set(buffer[i]); |
| } |
| this.buffer = null; |
| this.pos = 0; |
| } |
| |
| private void growBuffer(int minSize) { |
| if (minSize < buffer.length) return; |
| |
| int newSize = buffer.length; |
| while (newSize < minSize) { |
| newSize = newSize << 1; |
| } |
| newSize = Math.min(newSize, threshold); |
| |
| int[] newBuffer = new int[newSize]; |
| System.arraycopy(buffer, 0, newBuffer, 0, pos); |
| buffer = newBuffer; |
| } |
| |
| public void add(DocIdSetIterator iter, int base) throws IOException { |
| grow((int) Math.min(Integer.MAX_VALUE, iter.cost())); |
| |
| if (bitSet != null) { |
| add(bitSet, iter, base); |
| } else { |
| while (true) { |
| for (int i = pos; i < buffer.length; ++i) { |
| final int doc = iter.nextDoc(); |
| if (doc == DocIdSetIterator.NO_MORE_DOCS) { |
| pos = i; // update pos |
| return; |
| } |
| buffer[i] = doc + base; // using the loop counter may help with removal of bounds checking |
| } |
| |
| pos = buffer.length; // update pos |
| if (pos + 1 >= threshold) { |
| break; |
| } |
| |
| growBuffer(pos + 1); |
| } |
| |
| upgradeToBitSet(); |
| add(bitSet, iter, base); |
| } |
| } |
| |
| |
| public static void add(FixedBitSet bitSet, DocIdSetIterator iter, int base) throws IOException { |
| for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) { |
| bitSet.set(doc + base); |
| } |
| } |
| |
| /** Returns the number of terms visited */ |
| public int add(TermsEnum te, int base) throws IOException { |
| PostingsEnum postings = null; |
| |
| int termCount = 0; |
| for(;;) { |
| BytesRef term = te.next(); |
| if (term == null) break; |
| termCount++; |
| postings = te.postings(postings, PostingsEnum.NONE); |
| add(postings, base); |
| } |
| |
| return termCount; |
| } |
| |
| |
| public void grow(int numDocs) { |
| if (bitSet == null) { |
| final long newLength = pos + numDocs; |
| if (newLength < threshold) { |
| growBuffer((int) newLength); |
| } else { |
| upgradeToBitSet(); |
| } |
| } |
| } |
| |
| |
| public void add(int doc) { |
| if (bitSet != null) { |
| bitSet.set(doc); |
| } else { |
| if (pos >= buffer.length) { |
| if (pos + 1 >= threshold) { |
| upgradeToBitSet(); |
| bitSet.set(doc); |
| return; |
| } |
| growBuffer(pos + 1); |
| } |
| buffer[pos++] = doc; |
| } |
| } |
| |
| private static int dedup(int[] arr, int length, FixedBitSet acceptDocs) { |
| int pos = 0; |
| int previous = -1; |
| for (int i = 0; i < length; ++i) { |
| final int value = arr[i]; |
| // assert value >= previous; |
| if (value != previous && (acceptDocs == null || acceptDocs.get(value))) { |
| arr[pos++] = value; |
| previous = value; |
| } |
| } |
| return pos; |
| } |
| |
| |
| |
| public DocSet build(FixedBitSet filter) { |
| if (bitSet != null) { |
| if (filter != null) { |
| bitSet.and(filter); |
| } |
| return new BitDocSet(bitSet); |
| // TODO - if this set will be cached, should we make it smaller if it's below DocSetUtil.smallSetSize? |
| } else { |
| LSBRadixSorter sorter = new LSBRadixSorter(); |
| sorter.sort(PackedInts.bitsRequired(maxDoc - 1), buffer, pos); |
| final int l = dedup(buffer, pos, filter); |
| assert l <= pos; |
| return new SortedIntDocSet(buffer, l); // TODO: have option to not shrink in the future if it will be a temporary set |
| } |
| } |
| |
| /** Only use this if you know there were no duplicates and that docs were collected in-order! */ |
| public DocSet buildUniqueInOrder(FixedBitSet filter) { |
| if (bitSet != null) { |
| if (filter != null) { |
| bitSet.and(filter); |
| } |
| return new BitDocSet(bitSet); |
| } else { |
| // don't need to sort, but still need to remove non accepted docs |
| int l = pos; |
| if (filter != null) { |
| l = dedup(buffer, pos, filter); |
| } |
| return new SortedIntDocSet(buffer, l); // TODO: have option to not shrink in the future if it will be a temporary set |
| } |
| } |
| |
| } |
| |