/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

import org.apache.solr.common.SolrException;

/** @lucene.experimental */
public class DocSetUtil {

  /** The cut-off point for small sets (SortedIntDocSet) vs large sets (BitDocSet) */
  public static int smallSetSize(int maxDoc) {
    return (maxDoc >> 6) + 5; // The +5 is for better test coverage for small sets
  }
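
  // Worked example (illustrative numbers, not from the original source): for an
  // index with maxDoc = 64,000 the cutoff is (64000 >> 6) + 5 = 1,005 docs. At
  // that size a SortedIntDocSet costs about 4KB (one 4-byte int per doc), roughly
  // half of the 8KB a FixedBitSet spanning all 64,000 docs would occupy, so the
  // array representation is only used while it is clearly the cheaper of the two.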

  /**
   * Iterates DocSets to test for equality - slow and for testing purposes only.
   * @lucene.internal
   */
  public static boolean equals(DocSet a, DocSet b) {
    DocIterator iter1 = a.iterator();
    DocIterator iter2 = b.iterator();

    for (;;) {
      boolean n1 = iter1.hasNext();
      boolean n2 = iter2.hasNext();
      if (n1 != n2) {
        return false;
      }
      if (!n1) return true; // made it to the end
      int d1 = iter1.nextDoc();
      int d2 = iter2.nextDoc();
      if (d1 != d2) {
        return false;
      }
    }
  }

  /**
   * This variant of getDocSet attempts to deduplicate certain DocSets, such as
   * those whose size equals numDocs (i.e. they match all non-deleted documents).
   * It may therefore return a cached/shared instance, and the returned set must
   * not be modified.
   * @lucene.experimental
   */
  public static DocSet getDocSet(DocSetCollector collector, SolrIndexSearcher searcher) {
    if (collector.size() == searcher.numDocs()) {
      if (!searcher.isLiveDocsInstantiated()) {
        searcher.setLiveDocs(collector.getDocSet());
      }
      try {
        return searcher.getLiveDocSet();
      } catch (IOException e) {
        // should be impossible... liveDocs should already exist, so no IO should be necessary
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }

    return collector.getDocSet();
  }
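
  // Illustrative call pattern (hypothetical caller code, mirroring what
  // createDocSetGeneric below actually does):
  //   DocSetCollector collector = new DocSetCollector(searcher.getIndexReader().maxDoc());
  //   searcher.search(query, collector);
  //   DocSet set = DocSetUtil.getDocSet(collector, searcher); // may return the shared live-docs set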

  /**
   * This variant of getDocSet maps all sets whose size equals numDocs to
   * searcher.getLiveDocs. The returned set should not be modified.
   * @lucene.experimental
   */
  public static DocSet getDocSet(DocSet docs, SolrIndexSearcher searcher) {
    if (docs.size() == searcher.numDocs()) {
      if (!searcher.isLiveDocsInstantiated()) {
        searcher.setLiveDocs(docs);
      }
      try {
        // If this docset has the same cardinality as liveDocs, return liveDocs instead,
        // so that the passed-in set becomes short-lived garbage.
        return searcher.getLiveDocSet();
      } catch (IOException e) {
        // should be impossible... liveDocs should already exist, so no IO should be necessary
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }

    return docs;
  }

  // DocSetProducer implementations must not call this with themselves as the query,
  // or it will recurse infinitely.
  public static DocSet createDocSet(SolrIndexSearcher searcher, Query query, DocSet filter) throws IOException {
    if (filter != null) {
      query = QueryUtils.combineQueryAndFilter(query, filter.getTopFilter());
    }

    if (query instanceof TermQuery) {
      DocSet set = createDocSet(searcher, ((TermQuery) query).getTerm());
      // assert equals(set, createDocSetGeneric(searcher, query));
      return set;
    } else if (query instanceof DocSetProducer) {
      DocSet set = ((DocSetProducer) query).createDocSet(searcher);
      // assert equals(set, createDocSetGeneric(searcher, query));
      return set;
    }

    return createDocSetGeneric(searcher, query);
  }
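
  // Illustrative usage (hypothetical field/value): a plain term query takes the
  // postings-based fast path above, skipping query execution entirely:
  //   DocSet matching = DocSetUtil.createDocSet(searcher, new TermQuery(new Term("field", "value")), null);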

  // Produces DocSets for queries that are not DocSetProducers.
  public static DocSet createDocSetGeneric(SolrIndexSearcher searcher, Query query) throws IOException {
    int maxDoc = searcher.getIndexReader().maxDoc();
    DocSetCollector collector = new DocSetCollector(maxDoc);

    // This may throw an ExitableDirectoryReader.ExitingReaderException, but we should
    // not catch it here, as we don't know how this DocSet will be used (it could be
    // negated before use) or cached.
    searcher.search(query, collector);

    return getDocSet(collector, searcher);
  }

  public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException {
    DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead
    int maxDoc = searcher.getIndexReader().maxDoc();
    int smallSetSize = smallSetSize(maxDoc);

    String field = term.field();
    BytesRef termVal = term.bytes();

    int maxCount = 0;
    int firstReader = -1;
    List<LeafReaderContext> leaves = reader.leaves();
    PostingsEnum[] postList = new PostingsEnum[leaves.size()]; // use array for slightly higher scanning cost, but fewer memory allocations

    for (LeafReaderContext ctx : leaves) {
      assert leaves.get(ctx.ord) == ctx;
      LeafReader r = ctx.reader();
      Terms t = r.terms(field);
      if (t == null) continue; // field is missing
      TermsEnum te = t.iterator();
      if (te.seekExact(termVal)) {
        maxCount += te.docFreq();
        postList[ctx.ord] = te.postings(null, PostingsEnum.NONE);
        if (firstReader < 0) firstReader = ctx.ord;
      }
    }
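
    // maxCount is only an upper bound: docFreq() includes deleted documents, which
    // are filtered out against liveDocs when the postings are actually walked.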
    DocSet answer = null;
    if (maxCount == 0) {
      answer = DocSet.EMPTY;
    } else if (maxCount <= smallSetSize) {
      answer = createSmallSet(leaves, postList, maxCount, firstReader);
    } else {
      answer = createBigSet(leaves, postList, maxDoc, firstReader);
    }
    return DocSetUtil.getDocSet(answer, searcher);
  }

  private static DocSet createSmallSet(List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxPossible, int firstReader) throws IOException {
    int[] docs = new int[maxPossible];
    int sz = 0;

    for (int i = firstReader; i < postList.length; i++) {
      PostingsEnum postings = postList[i];
      if (postings == null) continue;
      LeafReaderContext ctx = leaves.get(i);
      Bits liveDocs = ctx.reader().getLiveDocs();
      int base = ctx.docBase;
      for (;;) {
        int subId = postings.nextDoc();
        if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
        if (liveDocs != null && !liveDocs.get(subId)) continue;
        int globalId = subId + base;
        docs[sz++] = globalId;
      }
    }
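
    // The array is already sorted: leaves are visited in ascending docBase order,
    // and each PostingsEnum enumerates doc ids in ascending order.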
    return new SortedIntDocSet(docs, sz);
  }

  private static DocSet createBigSet(List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxDoc, int firstReader) throws IOException {
    long[] bits = new long[FixedBitSet.bits2words(maxDoc)];
    int sz = 0;

    for (int i = firstReader; i < postList.length; i++) {
      PostingsEnum postings = postList[i];
      if (postings == null) continue;
      LeafReaderContext ctx = leaves.get(i);
      Bits liveDocs = ctx.reader().getLiveDocs();
      int base = ctx.docBase;
      for (;;) {
        int subId = postings.nextDoc();
        if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
        if (liveDocs != null && !liveDocs.get(subId)) continue;
        int globalId = subId + base;
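        // Java masks long shift counts to the low 6 bits, so (1L << globalId) is
        // equivalent to (1L << (globalId & 63)); globalId >> 6 selects the word.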
        bits[globalId >> 6] |= (1L << globalId);
        sz++;
      }
    }
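
    // Deleted documents were skipped above, so the actual count (sz) can come in
    // well under the docFreq-based estimate that routed us here; downgrade to the
    // more compact SortedIntDocSet representation when it does.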
    BitDocSet docSet = new BitDocSet(new FixedBitSet(bits, maxDoc), sz);

    int smallSetSize = smallSetSize(maxDoc);
    if (sz < smallSetSize) {
      // make this optional?
      DocSet smallSet = toSmallSet(docSet);
      // assert equals(docSet, smallSet);
      return smallSet;
    }

    return docSet;
  }

  public static DocSet toSmallSet(BitDocSet bitSet) {
    int sz = bitSet.size();
    int[] docs = new int[sz];
    FixedBitSet bs = bitSet.getBits();
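
    // Walk the set bits in ascending order. This assumes bitSet.size() is the
    // exact cardinality, since the loop calls nextSetBit exactly that many times.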
    int doc = -1;
    for (int i = 0; i < sz; i++) {
      doc = bs.nextSetBit(doc + 1);
      docs[i] = doc;
    }
    return new SortedIntDocSet(docs);
  }

  public static void collectSortedDocSet(DocSet docs, IndexReader reader, Collector collector) throws IOException {
    // TODO add SortedDocSet sub-interface and take that.
    // TODO collectUnsortedDocSet: iterate segment, then all docSet per segment.

    final List<LeafReaderContext> leaves = reader.leaves();
    final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
    int segBase = 0;
    int segMax;
    int adjustedMax = 0;
    LeafReaderContext ctx = null;
    LeafCollector leafCollector = null;
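
    // Because the DocSet is sorted by global doc id, we can leapfrog through the
    // segments: advance to the next leaf (and fetch a new LeafCollector) only when
    // a doc id crosses the current segment's upper bound (docBase + maxDoc).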
    for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (doc >= adjustedMax) {
        do {
          ctx = ctxIt.next();
          segBase = ctx.docBase;
          segMax = ctx.reader().maxDoc();
          adjustedMax = segBase + segMax;
        } while (doc >= adjustedMax);
        leafCollector = collector.getLeafCollector(ctx);
      }
      if (doc < segBase) {
        throw new IllegalStateException("algorithm expects a sorted DocSet but it wasn't sorted: " + docs.getClass());
      }
      leafCollector.collect(doc - segBase); // per-segment collector
    }
  }
}