lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java - lucene-solr - Git at Google

 package org.apache.lucene.facet.sampling;

 import java.io.IOException;

 import org.apache.lucene.facet.params.FacetSearchParams;
 import org.apache.lucene.facet.search.DrillDownQuery;
 import org.apache.lucene.facet.search.FacetResultNode;
 import org.apache.lucene.facet.search.ScoredDocIDs;
 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
 import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.Bits;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /**
  * Fix sampling results by correct results, by counting the intersection between
  * two lists: a TermDocs (list of documents in a certain category) and a
  * DocIdSetIterator (list of documents matching the query).
  * <p>
  * This fixer is suitable for scenarios which prioritize accuracy over
  * performance.
  * <p>
  * <b>Note:</b> for statistically more accurate top-k selection, set
  * {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
  * least 2, so that the top-k categories would have better chance of showing up
  * in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}
  *
  * @lucene.experimental
  */
 public class TakmiSampleFixer extends SampleFixer {

   private TaxonomyReader taxonomyReader;
   private IndexReader indexReader;
   private FacetSearchParams searchParams;

   public TakmiSampleFixer(IndexReader indexReader,
       TaxonomyReader taxonomyReader, FacetSearchParams searchParams) {
     this.indexReader = indexReader;
     this.taxonomyReader = taxonomyReader;
     this.searchParams = searchParams;
   }

   @Override
   public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
     recount(facetResNode, docIds);
   }

   /**
    * Internal utility: recount for a facet result node
    *
    * @param fresNode
    *          result node to be recounted
    * @param docIds
    *          full set of matching documents.
    * @throws IOException If there is a low-level I/O error.
    */
   private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) throws IOException {
     // TODO (Facet): change from void to return the new, smaller docSet, and use
     // that for the children, as this will make their intersection ops faster.
     // can do this only when the new set is "sufficiently" smaller.

     /* We need the category's path name in order to do its recounting.
      * If it is missing, because the option to label only part of the
      * facet results was exercise, we need to calculate them anyway, so
      * in essence sampling with recounting spends some extra cycles for
      * labeling results for which labels are not required. */
     if (fresNode.label == null) {
       fresNode.label = taxonomyReader.getPath(fresNode.ordinal);
     }
     CategoryPath catPath = fresNode.label;

     Term drillDownTerm = DrillDownQuery.term(searchParams.indexingParams, catPath);
     // TODO (Facet): avoid Multi*?
     Bits liveDocs = MultiFields.getLiveDocs(indexReader);
     int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, liveDocs,
                                                                      drillDownTerm.field(), drillDownTerm.bytes(),
                                                                      0), docIds.iterator());
     fresNode.value = updatedCount;
   }

   /**
    * Count the size of the intersection between two lists: a TermDocs (list of
    * documents in a certain category) and a DocIdSetIterator (list of documents
    * matching a query).
    */
   private static int countIntersection(DocsEnum p1, ScoredDocIDsIterator p2)
       throws IOException {
     // The documentation of of both TermDocs and DocIdSetIterator claim
     // that we must do next() before doc(). So we do, and if one of the
     // lists is empty, obviously return 0;
     if (p1 == null || p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
       return 0;
     }
     if (!p2.next()) {
       return 0;
     }

     int d1 = p1.docID();
     int d2 = p2.getDocID();

     int count = 0;
     for (;;) {
       if (d1 == d2) {
         ++count;
         if (p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
           break; // end of list 1, nothing more in intersection
         }
         d1 = p1.docID();
         if (!advance(p2, d1)) {
           break; // end of list 2, nothing more in intersection
         }
         d2 = p2.getDocID();
       } else if (d1 < d2) {
         if (p1.advance(d2) == DocIdSetIterator.NO_MORE_DOCS) {
           break; // end of list 1, nothing more in intersection
         }
         d1 = p1.docID();
       } else /* d1>d2 */ {
         if (!advance(p2, d1)) {
           break; // end of list 2, nothing more in intersection
         }
         d2 = p2.getDocID();
       }
     }
     return count;
   }

   /**
    * utility: advance the iterator until finding (or exceeding) specific
    * document
    *
    * @param iterator
    *          iterator being advanced
    * @param targetDoc
    *          target of advancing
    * @return false if iterator exhausted, true otherwise.
    */
   private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) {
     while (iterator.next()) {
       if (iterator.getDocID() >= targetDoc) {
         return true; // target reached
       }
     }
     return false; // exhausted
   }

 }
	package org.apache.lucene.facet.sampling;

	import java.io.IOException;

	import org.apache.lucene.facet.params.FacetSearchParams;
	import org.apache.lucene.facet.search.DrillDownQuery;
	import org.apache.lucene.facet.search.FacetResultNode;
	import org.apache.lucene.facet.search.ScoredDocIDs;
	import org.apache.lucene.facet.search.ScoredDocIDsIterator;
	import org.apache.lucene.facet.taxonomy.CategoryPath;
	import org.apache.lucene.facet.taxonomy.TaxonomyReader;
	import org.apache.lucene.index.DocsEnum;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.MultiFields;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.util.Bits;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/**
	* Fix sampling results by correct results, by counting the intersection between
	* two lists: a TermDocs (list of documents in a certain category) and a
	* DocIdSetIterator (list of documents matching the query).
	* <p>
	* This fixer is suitable for scenarios which prioritize accuracy over
	* performance.
	* <p>
	* <b>Note:</b> for statistically more accurate top-k selection, set
	* {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
	* least 2, so that the top-k categories would have better chance of showing up
	* in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}
	*
	* @lucene.experimental
	*/
	public class TakmiSampleFixer extends SampleFixer {

	private TaxonomyReader taxonomyReader;
	private IndexReader indexReader;
	private FacetSearchParams searchParams;

	public TakmiSampleFixer(IndexReader indexReader,
	TaxonomyReader taxonomyReader, FacetSearchParams searchParams) {
	this.indexReader = indexReader;
	this.taxonomyReader = taxonomyReader;
	this.searchParams = searchParams;
	}

	@Override
	public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
	recount(facetResNode, docIds);
	}

	/**
	* Internal utility: recount for a facet result node
	*
	* @param fresNode
	* result node to be recounted
	* @param docIds
	* full set of matching documents.
	* @throws IOException If there is a low-level I/O error.
	*/
	private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) throws IOException {
	// TODO (Facet): change from void to return the new, smaller docSet, and use
	// that for the children, as this will make their intersection ops faster.
	// can do this only when the new set is "sufficiently" smaller.

	/* We need the category's path name in order to do its recounting.
	* If it is missing, because the option to label only part of the
	* facet results was exercise, we need to calculate them anyway, so
	* in essence sampling with recounting spends some extra cycles for
	* labeling results for which labels are not required. */
	if (fresNode.label == null) {
	fresNode.label = taxonomyReader.getPath(fresNode.ordinal);
	}
	CategoryPath catPath = fresNode.label;

	Term drillDownTerm = DrillDownQuery.term(searchParams.indexingParams, catPath);
	// TODO (Facet): avoid Multi*?
	Bits liveDocs = MultiFields.getLiveDocs(indexReader);
	int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, liveDocs,
	drillDownTerm.field(), drillDownTerm.bytes(),
	0), docIds.iterator());
	fresNode.value = updatedCount;
	}

	/**
	* Count the size of the intersection between two lists: a TermDocs (list of
	* documents in a certain category) and a DocIdSetIterator (list of documents
	* matching a query).
	*/
	private static int countIntersection(DocsEnum p1, ScoredDocIDsIterator p2)
	throws IOException {
	// The documentation of of both TermDocs and DocIdSetIterator claim
	// that we must do next() before doc(). So we do, and if one of the
	// lists is empty, obviously return 0;
	if (p1 == null \|\| p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
	return 0;
	}
	if (!p2.next()) {
	return 0;
	}

	int d1 = p1.docID();
	int d2 = p2.getDocID();

	int count = 0;
	for (;;) {
	if (d1 == d2) {
	++count;
	if (p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
	break; // end of list 1, nothing more in intersection
	}
	d1 = p1.docID();
	if (!advance(p2, d1)) {
	break; // end of list 2, nothing more in intersection
	}
	d2 = p2.getDocID();
	} else if (d1 < d2) {
	if (p1.advance(d2) == DocIdSetIterator.NO_MORE_DOCS) {
	break; // end of list 1, nothing more in intersection
	}
	d1 = p1.docID();
	} else /* d1>d2 */ {
	if (!advance(p2, d1)) {
	break; // end of list 2, nothing more in intersection
	}
	d2 = p2.getDocID();
	}
	}
	return count;
	}

	/**
	* utility: advance the iterator until finding (or exceeding) specific
	* document
	*
	* @param iterator
	* iterator being advanced
	* @param targetDoc
	* target of advancing
	* @return false if iterator exhausted, true otherwise.
	*/
	private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) {
	while (iterator.next()) {
	if (iterator.getDocID() >= targetDoc) {
	return true; // target reached
	}
	}
	return false; // exhausted
	}

	}