lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.util;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.packed.PackedInts;

 /**
  * A builder of {@link DocIdSet}s.  At first it uses a sparse structure to gather
  * documents, and then upgrades to a non-sparse bit set once enough hits match.
  *
  * To add documents, you first need to call {@link #grow} in order to reserve
  * space, and then call {@link BulkAdder#add(int)} on the returned
  * {@link BulkAdder}.
  *
  * @lucene.internal
  */
 public final class DocIdSetBuilder {

   /** Utility class to efficiently add many docs in one go.
    *  @see DocIdSetBuilder#grow */
   public static abstract class BulkAdder {
     public abstract void add(int doc);
   }

   private static class FixedBitSetAdder extends BulkAdder {
     final FixedBitSet bitSet;

     FixedBitSetAdder(FixedBitSet bitSet) {
       this.bitSet = bitSet;
     }

     @Override
     public void add(int doc) {
       bitSet.set(doc);
     }
   }

   private static class Buffer {
     int[] array;
     int length;

     Buffer(int length) {
       this.array = new int[length];
       this.length = 0;
     }

     Buffer(int[] array, int length) {
       this.array = array;
       this.length = length;
     }
   }

   private static class BufferAdder extends BulkAdder {
     final Buffer buffer;

     BufferAdder(Buffer buffer) {
       this.buffer = buffer;
     }

     @Override
     public void add(int doc) {
       buffer.array[buffer.length++] = doc;
     }
   }

   private final int maxDoc;
   private final int threshold;
   // pkg-private for testing
   final boolean multivalued;
   final double numValuesPerDoc;

   private List<Buffer> buffers = new ArrayList<>();
   private int totalAllocated; // accumulated size of the allocated buffers

   private FixedBitSet bitSet;

   private long counter = -1;
   private BulkAdder adder;

   /**
    * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
    */
   public DocIdSetBuilder(int maxDoc) {
     this(maxDoc, -1, -1);
   }

   /** Create a {@link DocIdSetBuilder} instance that is optimized for
    *  accumulating docs that match the given {@link Terms}. */
   public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
     this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
   }

   /** Create a {@link DocIdSetBuilder} instance that is optimized for
    *  accumulating docs that match the given {@link PointValues}. */
   public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException {
     this(maxDoc, values.getDocCount(), values.size());
   }

   DocIdSetBuilder(int maxDoc, int docCount, long valueCount) {
     this.maxDoc = maxDoc;
     this.multivalued = docCount < 0 || docCount != valueCount;
     if (docCount <= 0 || valueCount < 0) {
       // assume one value per doc, this means the cost will be overestimated
       // if the docs are actually multi-valued
       this.numValuesPerDoc = 1;
     } else {
       // otherwise compute from index stats
       this.numValuesPerDoc = (double) valueCount / docCount;
     }

     assert numValuesPerDoc >= 1: "valueCount=" + valueCount + " docCount=" + docCount;

     // For ridiculously small sets, we'll just use a sorted int[]
     // maxDoc >>> 7 is a good value if you want to save memory, lower values
     // such as maxDoc >>> 11 should provide faster building but at the expense
     // of using a full bitset even for quite sparse data
     this.threshold = maxDoc >>> 7;

     this.bitSet = null;
   }

   /**
    * Add the content of the provided {@link DocIdSetIterator} to this builder.
    * NOTE: if you need to build a {@link DocIdSet} out of a single
    * {@link DocIdSetIterator}, you should rather use {@link RoaringDocIdSet.Builder}.
    */
   public void add(DocIdSetIterator iter) throws IOException {
     if (bitSet != null) {
       bitSet.or(iter);
       return;
     }
     int cost = (int) Math.min(Integer.MAX_VALUE, iter.cost());
     BulkAdder adder = grow(cost);
     for (int i = 0; i < cost; ++i) {
       int doc = iter.nextDoc();
       if (doc == DocIdSetIterator.NO_MORE_DOCS) {
         return;
       }
       adder.add(doc);
     }
     for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
       grow(1).add(doc);
     }
   }

   /**
    * Reserve space and return a {@link BulkAdder} object that can be used to
    * add up to {@code numDocs} documents.
    */
   public BulkAdder grow(int numDocs) {
     if (bitSet == null) {
       if ((long) totalAllocated + numDocs <= threshold) {
         ensureBufferCapacity(numDocs);
       } else {
         upgradeToBitSet();
         counter += numDocs;
       }
     } else {
       counter += numDocs;
     }
     return adder;
   }

   private void ensureBufferCapacity(int numDocs) {
     if (buffers.isEmpty()) {
       addBuffer(additionalCapacity(numDocs));
       return;
     }

     Buffer current = buffers.get(buffers.size() - 1);
     if (current.array.length - current.length >= numDocs) {
       // current buffer is large enough
       return;
     }
     if (current.length < current.array.length - (current.array.length >>> 3)) {
       // current buffer is less than 7/8 full, resize rather than waste space
       growBuffer(current, additionalCapacity(numDocs));
     } else {
       addBuffer(additionalCapacity(numDocs));
     }
   }

   private int additionalCapacity(int numDocs) {
     // exponential growth: the new array has a size equal to the sum of what
     // has been allocated so far
     int c = totalAllocated;
     // but is also >= numDocs + 1 so that we can store the next batch of docs
     // (plus an empty slot so that we are more likely to reuse the array in build())
     c = Math.max(numDocs + 1, c);
     // avoid cold starts
     c = Math.max(32, c);
     // do not go beyond the threshold
     c = Math.min(threshold - totalAllocated, c);
     return c;
   }

   private Buffer addBuffer(int len) {
     Buffer buffer = new Buffer(len);
     buffers.add(buffer);
     adder = new BufferAdder(buffer);
     totalAllocated += buffer.array.length;
     return buffer;
   }

   private void growBuffer(Buffer buffer, int additionalCapacity) {
     buffer.array = ArrayUtil.growExact(buffer.array, buffer.array.length + additionalCapacity);
     totalAllocated += additionalCapacity;
   }

   private void upgradeToBitSet() {
     assert bitSet == null;
     FixedBitSet bitSet = new FixedBitSet(maxDoc);
     long counter = 0;
     for (Buffer buffer : buffers) {
       int[] array = buffer.array;
       int length = buffer.length;
       counter += length;
       for (int i = 0; i < length; ++i) {
         bitSet.set(array[i]);
       }
     }
     this.bitSet = bitSet;
     this.counter = counter;
     this.buffers = null;
     this.adder = new FixedBitSetAdder(bitSet);
   }

   /**
    * Build a {@link DocIdSet} from the accumulated doc IDs.
    */
   public DocIdSet build() {
     try {
       if (bitSet != null) {
         assert counter >= 0;
         final long cost = Math.round(counter / numValuesPerDoc);
         return new BitDocIdSet(bitSet, cost);
       } else {
         Buffer concatenated = concat(buffers);
         LSBRadixSorter sorter = new LSBRadixSorter();
         sorter.sort(PackedInts.bitsRequired(maxDoc - 1), concatenated.array, concatenated.length);
         final int l;
         if (multivalued) {
           l = dedup(concatenated.array, concatenated.length);
         } else {
           assert noDups(concatenated.array, concatenated.length);
           l = concatenated.length;
         }
         assert l <= concatenated.length;
         concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS;
         return new IntArrayDocIdSet(concatenated.array, l);
       }
     } finally {
       this.buffers = null;
       this.bitSet = null;
     }
   }

   /**
    * Concatenate the buffers in any order, leaving at least one empty slot in
    * the end
    * NOTE: this method might reuse one of the arrays
    */
   private static Buffer concat(List<Buffer> buffers) {
     int totalLength = 0;
     Buffer largestBuffer = null;
     for (Buffer buffer : buffers) {
       totalLength += buffer.length;
       if (largestBuffer == null || buffer.array.length > largestBuffer.array.length) {
         largestBuffer = buffer;
       }
     }
     if (largestBuffer == null) {
       return new Buffer(1);
     }
     int[] docs = largestBuffer.array;
     if (docs.length < totalLength + 1) {
       docs = ArrayUtil.growExact(docs, totalLength + 1);
     }
     totalLength = largestBuffer.length;
     for (Buffer buffer : buffers) {
       if (buffer != largestBuffer) {
         System.arraycopy(buffer.array, 0, docs, totalLength, buffer.length);
         totalLength += buffer.length;
       }
     }
     return new Buffer(docs, totalLength);
   }

   private static int dedup(int[] arr, int length) {
     if (length == 0) {
       return 0;
     }
     int l = 1;
     int previous = arr[0];
     for (int i = 1; i < length; ++i) {
       final int value = arr[i];
       assert value >= previous;
       if (value != previous) {
         arr[l++] = value;
         previous = value;
       }
     }
     return l;
   }

   private static boolean noDups(int[] a, int len) {
     for (int i = 1; i < len; ++i) {
       assert a[i-1] < a[i];
     }
     return true;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.util;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.lucene.index.PointValues;
	import org.apache.lucene.index.Terms;
	import org.apache.lucene.search.DocIdSet;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.util.packed.PackedInts;

	/**
	* A builder of {@link DocIdSet}s. At first it uses a sparse structure to gather
	* documents, and then upgrades to a non-sparse bit set once enough hits match.
	*
	* To add documents, you first need to call {@link #grow} in order to reserve
	* space, and then call {@link BulkAdder#add(int)} on the returned
	* {@link BulkAdder}.
	*
	* @lucene.internal
	*/
	public final class DocIdSetBuilder {

	/** Utility class to efficiently add many docs in one go.
	* @see DocIdSetBuilder#grow */
	public static abstract class BulkAdder {
	public abstract void add(int doc);
	}

	private static class FixedBitSetAdder extends BulkAdder {
	final FixedBitSet bitSet;

	FixedBitSetAdder(FixedBitSet bitSet) {
	this.bitSet = bitSet;
	}

	@Override
	public void add(int doc) {
	bitSet.set(doc);
	}
	}

	private static class Buffer {
	int[] array;
	int length;

	Buffer(int length) {
	this.array = new int[length];
	this.length = 0;
	}

	Buffer(int[] array, int length) {
	this.array = array;
	this.length = length;
	}
	}

	private static class BufferAdder extends BulkAdder {
	final Buffer buffer;

	BufferAdder(Buffer buffer) {
	this.buffer = buffer;
	}

	@Override
	public void add(int doc) {
	buffer.array[buffer.length++] = doc;
	}
	}

	private final int maxDoc;
	private final int threshold;
	// pkg-private for testing
	final boolean multivalued;
	final double numValuesPerDoc;

	private List<Buffer> buffers = new ArrayList<>();
	private int totalAllocated; // accumulated size of the allocated buffers

	private FixedBitSet bitSet;

	private long counter = -1;
	private BulkAdder adder;

	/**
	* Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
	*/
	public DocIdSetBuilder(int maxDoc) {
	this(maxDoc, -1, -1);
	}

	/** Create a {@link DocIdSetBuilder} instance that is optimized for
	* accumulating docs that match the given {@link Terms}. */
	public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
	this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
	}

	/** Create a {@link DocIdSetBuilder} instance that is optimized for
	* accumulating docs that match the given {@link PointValues}. */
	public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException {
	this(maxDoc, values.getDocCount(), values.size());
	}

	DocIdSetBuilder(int maxDoc, int docCount, long valueCount) {
	this.maxDoc = maxDoc;
	this.multivalued = docCount < 0 \|\| docCount != valueCount;
	if (docCount <= 0 \|\| valueCount < 0) {
	// assume one value per doc, this means the cost will be overestimated
	// if the docs are actually multi-valued
	this.numValuesPerDoc = 1;
	} else {
	// otherwise compute from index stats
	this.numValuesPerDoc = (double) valueCount / docCount;
	}

	assert numValuesPerDoc >= 1: "valueCount=" + valueCount + " docCount=" + docCount;

	// For ridiculously small sets, we'll just use a sorted int[]
	// maxDoc >>> 7 is a good value if you want to save memory, lower values
	// such as maxDoc >>> 11 should provide faster building but at the expense
	// of using a full bitset even for quite sparse data
	this.threshold = maxDoc >>> 7;

	this.bitSet = null;
	}

	/**
	* Add the content of the provided {@link DocIdSetIterator} to this builder.
	* NOTE: if you need to build a {@link DocIdSet} out of a single
	* {@link DocIdSetIterator}, you should rather use {@link RoaringDocIdSet.Builder}.
	*/
	public void add(DocIdSetIterator iter) throws IOException {
	if (bitSet != null) {
	bitSet.or(iter);
	return;
	}
	int cost = (int) Math.min(Integer.MAX_VALUE, iter.cost());
	BulkAdder adder = grow(cost);
	for (int i = 0; i < cost; ++i) {
	int doc = iter.nextDoc();
	if (doc == DocIdSetIterator.NO_MORE_DOCS) {
	return;
	}
	adder.add(doc);
	}
	for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
	grow(1).add(doc);
	}
	}

	/**
	* Reserve space and return a {@link BulkAdder} object that can be used to
	* add up to {@code numDocs} documents.
	*/
	public BulkAdder grow(int numDocs) {
	if (bitSet == null) {
	if ((long) totalAllocated + numDocs <= threshold) {
	ensureBufferCapacity(numDocs);
	} else {
	upgradeToBitSet();
	counter += numDocs;
	}
	} else {
	counter += numDocs;
	}
	return adder;
	}

	private void ensureBufferCapacity(int numDocs) {
	if (buffers.isEmpty()) {
	addBuffer(additionalCapacity(numDocs));
	return;
	}

	Buffer current = buffers.get(buffers.size() - 1);
	if (current.array.length - current.length >= numDocs) {
	// current buffer is large enough
	return;
	}
	if (current.length < current.array.length - (current.array.length >>> 3)) {
	// current buffer is less than 7/8 full, resize rather than waste space
	growBuffer(current, additionalCapacity(numDocs));
	} else {
	addBuffer(additionalCapacity(numDocs));
	}
	}

	private int additionalCapacity(int numDocs) {
	// exponential growth: the new array has a size equal to the sum of what
	// has been allocated so far
	int c = totalAllocated;
	// but is also >= numDocs + 1 so that we can store the next batch of docs
	// (plus an empty slot so that we are more likely to reuse the array in build())
	c = Math.max(numDocs + 1, c);
	// avoid cold starts
	c = Math.max(32, c);
	// do not go beyond the threshold
	c = Math.min(threshold - totalAllocated, c);
	return c;
	}

	private Buffer addBuffer(int len) {
	Buffer buffer = new Buffer(len);
	buffers.add(buffer);
	adder = new BufferAdder(buffer);
	totalAllocated += buffer.array.length;
	return buffer;
	}

	private void growBuffer(Buffer buffer, int additionalCapacity) {
	buffer.array = ArrayUtil.growExact(buffer.array, buffer.array.length + additionalCapacity);
	totalAllocated += additionalCapacity;
	}

	private void upgradeToBitSet() {
	assert bitSet == null;
	FixedBitSet bitSet = new FixedBitSet(maxDoc);
	long counter = 0;
	for (Buffer buffer : buffers) {
	int[] array = buffer.array;
	int length = buffer.length;
	counter += length;
	for (int i = 0; i < length; ++i) {
	bitSet.set(array[i]);
	}
	}
	this.bitSet = bitSet;
	this.counter = counter;
	this.buffers = null;
	this.adder = new FixedBitSetAdder(bitSet);
	}

	/**
	* Build a {@link DocIdSet} from the accumulated doc IDs.
	*/
	public DocIdSet build() {
	try {
	if (bitSet != null) {
	assert counter >= 0;
	final long cost = Math.round(counter / numValuesPerDoc);
	return new BitDocIdSet(bitSet, cost);
	} else {
	Buffer concatenated = concat(buffers);
	LSBRadixSorter sorter = new LSBRadixSorter();
	sorter.sort(PackedInts.bitsRequired(maxDoc - 1), concatenated.array, concatenated.length);
	final int l;
	if (multivalued) {
	l = dedup(concatenated.array, concatenated.length);
	} else {
	assert noDups(concatenated.array, concatenated.length);
	l = concatenated.length;
	}
	assert l <= concatenated.length;
	concatenated.array[l] = DocIdSetIterator.NO_MORE_DOCS;
	return new IntArrayDocIdSet(concatenated.array, l);
	}
	} finally {
	this.buffers = null;
	this.bitSet = null;
	}
	}

	/**
	* Concatenate the buffers in any order, leaving at least one empty slot in
	* the end
	* NOTE: this method might reuse one of the arrays
	*/
	private static Buffer concat(List<Buffer> buffers) {
	int totalLength = 0;
	Buffer largestBuffer = null;
	for (Buffer buffer : buffers) {
	totalLength += buffer.length;
	if (largestBuffer == null \|\| buffer.array.length > largestBuffer.array.length) {
	largestBuffer = buffer;
	}
	}
	if (largestBuffer == null) {
	return new Buffer(1);
	}
	int[] docs = largestBuffer.array;
	if (docs.length < totalLength + 1) {
	docs = ArrayUtil.growExact(docs, totalLength + 1);
	}
	totalLength = largestBuffer.length;
	for (Buffer buffer : buffers) {
	if (buffer != largestBuffer) {
	System.arraycopy(buffer.array, 0, docs, totalLength, buffer.length);
	totalLength += buffer.length;
	}
	}
	return new Buffer(docs, totalLength);
	}

	private static int dedup(int[] arr, int length) {
	if (length == 0) {
	return 0;
	}
	int l = 1;
	int previous = arr[0];
	for (int i = 1; i < length; ++i) {
	final int value = arr[i];
	assert value >= previous;
	if (value != previous) {
	arr[l++] = value;
	previous = value;
	}
	}
	return l;
	}

	private static boolean noDups(int[] a, int len) {
	for (int i = 1; i < len; ++i) {
	assert a[i-1] < a[i];
	}
	return true;
	}

	}