/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.solr.search;

import java.io.IOException;

import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LSBRadixSorter;
import org.apache.lucene.util.packed.PackedInts;

/**
* Adapted from Lucene's {@code DocIdSetBuilder} to build Solr {@link DocSet} instances.
*
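* <p>A minimal usage sketch (illustrative only; {@code maxDoc}, {@code costEstimate} and
* {@code termsEnum} are assumed to be supplied by the caller):
*
* <pre>
*   DocSetBuilder builder = new DocSetBuilder(maxDoc, costEstimate);
*   builder.add(termsEnum, 0);         // union the postings of every term in the enum
*   DocSet docs = builder.build(null); // pass a FixedBitSet instead of null to intersect with a filter
* </pre>
*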
* @lucene.internal
*/
public final class DocSetBuilder {
private final int maxDoc;
private final int threshold;  // buffer size (in docs) at which we switch to a FixedBitSet
private int[] buffer;         // doc id buffer, sorted lazily when build() is called
private int pos;              // number of valid entries in buffer
private FixedBitSet bitSet;   // non-null once we have upgraded from the buffer
public DocSetBuilder(int maxDoc, long costEst) {
this.maxDoc = maxDoc;
// For ridiculously small sets, we'll just use a sorted int[]
// maxDoc >>> 7 is a good value if you want to save memory, lower values
// such as maxDoc >>> 11 should provide faster building but at the expense
// of using a full bitset even for quite sparse data
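// e.g. with maxDoc = 1,000,000: threshold = (1,000,000 >>> 7) + 4 = 7816 ints (~31KB) vs. a ~122KB FixedBitSet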
this.threshold = (maxDoc >>> 7) + 4; // the +4 is for better testing on small indexes
if (costEst > threshold) {
bitSet = new FixedBitSet(maxDoc);
} else {
this.buffer = new int[Math.max((int)costEst,1)];
}
}
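
/** Copies the buffered doc ids into a freshly allocated FixedBitSet and discards the buffer. */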
private void upgradeToBitSet() {
assert bitSet == null;
bitSet = new FixedBitSet(maxDoc);
for (int i = 0; i < pos; ++i) {
bitSet.set(buffer[i]);
}
this.buffer = null;
this.pos = 0;
}
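
/** Grows the buffer to hold at least {@code minSize} entries, doubling its size but never growing past {@code threshold}. */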
private void growBuffer(int minSize) {
if (minSize < buffer.length) return;
int newSize = buffer.length;
while (newSize < minSize) {
newSize = newSize << 1;
}
newSize = Math.min(newSize, threshold);
int[] newBuffer = new int[newSize];
System.arraycopy(buffer, 0, newBuffer, 0, pos);
buffer = newBuffer;
}
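
/** Adds every document from {@code iter}, offset by {@code base}; upgrades to a bit set once the buffer would reach the threshold. */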
public void add(DocIdSetIterator iter, int base) throws IOException {
grow((int) Math.min(Integer.MAX_VALUE, iter.cost()));
if (bitSet != null) {
add(bitSet, iter, base);
} else {
while (true) {
for (int i = pos; i < buffer.length; ++i) {
final int doc = iter.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
pos = i; // update pos
return;
}
buffer[i] = doc + base; // using the loop counter may help with removal of bounds checking
}
pos = buffer.length; // update pos
if (pos + 1 >= threshold) {
break;
}
growBuffer(pos + 1);
}
upgradeToBitSet();
add(bitSet, iter, base);
}
}
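
/** Sets, in {@code bitSet}, every document returned by {@code iter}, each offset by {@code base}. */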
public static void add(FixedBitSet bitSet, DocIdSetIterator iter, int base) throws IOException {
for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
bitSet.set(doc + base);
}
}

/** Adds the documents of every remaining term in {@code te}; returns the number of terms visited. */
public int add(TermsEnum te, int base) throws IOException {
PostingsEnum postings = null;
int termCount = 0;
for(;;) {
BytesRef term = te.next();
if (term == null) break;
termCount++;
postings = te.postings(postings, PostingsEnum.NONE);
add(postings, base);
}
return termCount;
}
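
/** Reserves room for {@code numDocs} additional documents, upgrading to a bit set if the buffer would reach the threshold. */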
public void grow(int numDocs) {
if (bitSet == null) {
final long newLength = (long) pos + numDocs; // widen before adding so a huge numDocs cannot overflow
if (newLength < threshold) {
growBuffer((int) newLength);
} else {
upgradeToBitSet();
}
}
}
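
/** Adds a single document id to the set under construction. */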
public void add(int doc) {
if (bitSet != null) {
bitSet.set(doc);
} else {
if (pos >= buffer.length) {
if (pos + 1 >= threshold) {
upgradeToBitSet();
bitSet.set(doc);
return;
}
growBuffer(pos + 1);
}
buffer[pos++] = doc;
}
}
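
/** Removes duplicates from the sorted {@code arr} and drops documents not set in {@code acceptDocs} (if non-null); returns the new length. */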
private static int dedup(int[] arr, int length, FixedBitSet acceptDocs) {
int pos = 0;
int previous = -1;
for (int i = 0; i < length; ++i) {
final int value = arr[i];
// assert value >= previous;
if (value != previous && (acceptDocs == null || acceptDocs.get(value))) {
arr[pos++] = value;
previous = value;
}
}
return pos;
}
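
/** Builds a DocSet from the collected documents, intersecting with {@code filter} if it is non-null; the internal buffer or bit set is handed off to the returned set. */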
public DocSet build(FixedBitSet filter) {
if (bitSet != null) {
if (filter != null) {
bitSet.and(filter);
}
return new BitDocSet(bitSet);
// TODO - if this set will be cached, should we make it smaller if it's below DocSetUtil.smallSetSize?
} else {
LSBRadixSorter sorter = new LSBRadixSorter();
sorter.sort(PackedInts.bitsRequired(maxDoc - 1), buffer, pos);
final int l = dedup(buffer, pos, filter);
assert l <= pos;
return new SortedIntDocSet(buffer, l); // TODO: have option to not shrink in the future if it will be a temporary set
}
}

/** Only use this if you know there were no duplicates and that docs were collected in-order! */
public DocSet buildUniqueInOrder(FixedBitSet filter) {
if (bitSet != null) {
if (filter != null) {
bitSet.and(filter);
}
return new BitDocSet(bitSet);
} else {
// don't need to sort, but still need to remove non accepted docs
int l = pos;
if (filter != null) {
l = dedup(buffer, pos, filter);
}
return new SortedIntDocSet(buffer, l); // TODO: have option to not shrink in the future if it will be a temporary set
}
}
}