| package org.apache.lucene.facet.sampling; |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.facet.params.FacetSearchParams; |
| import org.apache.lucene.facet.search.DrillDownQuery; |
| import org.apache.lucene.facet.search.FacetResultNode; |
| import org.apache.lucene.facet.search.ScoredDocIDs; |
| import org.apache.lucene.facet.search.ScoredDocIDsIterator; |
| import org.apache.lucene.facet.taxonomy.CategoryPath; |
| import org.apache.lucene.facet.taxonomy.TaxonomyReader; |
| import org.apache.lucene.index.DocsEnum; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.MultiFields; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.util.Bits; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /** |
 * Fixes sampling results by replacing them with exact counts, computed by
 * counting the intersection between two lists: a TermDocs (list of documents
 * in a certain category) and a DocIdSetIterator (list of documents matching
 * the query).
| * <p> |
| * This fixer is suitable for scenarios which prioritize accuracy over |
| * performance. |
| * <p> |
| * <b>Note:</b> for statistically more accurate top-k selection, set |
| * {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at |
| * least 2, so that the top-k categories would have better chance of showing up |
 * in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}).
| * |
| * @lucene.experimental |
| */ |
| public class TakmiSampleFixer extends SampleFixer { |
| |
| private TaxonomyReader taxonomyReader; |
| private IndexReader indexReader; |
| private FacetSearchParams searchParams; |
| |
| public TakmiSampleFixer(IndexReader indexReader, |
| TaxonomyReader taxonomyReader, FacetSearchParams searchParams) { |
| this.indexReader = indexReader; |
| this.taxonomyReader = taxonomyReader; |
| this.searchParams = searchParams; |
| } |
| |
| @Override |
| public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException { |
| recount(facetResNode, docIds); |
| } |
| |
| /** |
| * Internal utility: recount for a facet result node |
| * |
| * @param fresNode |
| * result node to be recounted |
| * @param docIds |
| * full set of matching documents. |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) throws IOException { |
| // TODO (Facet): change from void to return the new, smaller docSet, and use |
| // that for the children, as this will make their intersection ops faster. |
| // can do this only when the new set is "sufficiently" smaller. |
| |
| /* We need the category's path name in order to do its recounting. |
| * If it is missing, because the option to label only part of the |
| * facet results was exercise, we need to calculate them anyway, so |
| * in essence sampling with recounting spends some extra cycles for |
| * labeling results for which labels are not required. */ |
| if (fresNode.label == null) { |
| fresNode.label = taxonomyReader.getPath(fresNode.ordinal); |
| } |
| CategoryPath catPath = fresNode.label; |
| |
| Term drillDownTerm = DrillDownQuery.term(searchParams.indexingParams, catPath); |
| // TODO (Facet): avoid Multi*? |
| Bits liveDocs = MultiFields.getLiveDocs(indexReader); |
| int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, liveDocs, |
| drillDownTerm.field(), drillDownTerm.bytes(), |
| 0), docIds.iterator()); |
| fresNode.value = updatedCount; |
| } |
| |
| /** |
| * Count the size of the intersection between two lists: a TermDocs (list of |
| * documents in a certain category) and a DocIdSetIterator (list of documents |
| * matching a query). |
| */ |
| private static int countIntersection(DocsEnum p1, ScoredDocIDsIterator p2) |
| throws IOException { |
| // The documentation of of both TermDocs and DocIdSetIterator claim |
| // that we must do next() before doc(). So we do, and if one of the |
| // lists is empty, obviously return 0; |
| if (p1 == null || p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { |
| return 0; |
| } |
| if (!p2.next()) { |
| return 0; |
| } |
| |
| int d1 = p1.docID(); |
| int d2 = p2.getDocID(); |
| |
| int count = 0; |
| for (;;) { |
| if (d1 == d2) { |
| ++count; |
| if (p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { |
| break; // end of list 1, nothing more in intersection |
| } |
| d1 = p1.docID(); |
| if (!advance(p2, d1)) { |
| break; // end of list 2, nothing more in intersection |
| } |
| d2 = p2.getDocID(); |
| } else if (d1 < d2) { |
| if (p1.advance(d2) == DocIdSetIterator.NO_MORE_DOCS) { |
| break; // end of list 1, nothing more in intersection |
| } |
| d1 = p1.docID(); |
| } else /* d1>d2 */ { |
| if (!advance(p2, d1)) { |
| break; // end of list 2, nothing more in intersection |
| } |
| d2 = p2.getDocID(); |
| } |
| } |
| return count; |
| } |
| |
| /** |
| * utility: advance the iterator until finding (or exceeding) specific |
| * document |
| * |
| * @param iterator |
| * iterator being advanced |
| * @param targetDoc |
| * target of advancing |
| * @return false if iterator exhausted, true otherwise. |
| */ |
| private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) { |
| while (iterator.next()) { |
| if (iterator.getDocID() >= targetDoc) { |
| return true; // target reached |
| } |
| } |
| return false; // exhausted |
| } |
| |
| } |