lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/PrefixTreeFacetCounter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.spatial.prefix;

 import java.io.IOException;
 import org.apache.lucene.index.IndexReaderContext;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.spatial.prefix.tree.Cell;
 import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
 import org.apache.lucene.util.Bits;
 import org.locationtech.spatial4j.shape.Shape;

 /**
  * Computes facets on cells for {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy}.
  *
  * <p><em>NOTE:</em> If for a given document and a given field using {@link
  * org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy} multiple values are indexed (i.e.
  * multi-valued) and at least one of them is a non-point, then there is a possibility of
  * double-counting the document in the facet results. Since each shape is independently turned into
  * grid cells at a resolution chosen by the shape's size, it's possible they will be indexed at
  * different resolutions. This means the document could be present in BOTH the postings for a cell
  * in both its prefix and leaf variants. To avoid this, use a single valued field with a {@link
  * org.locationtech.spatial4j.shape.ShapeCollection} (or WKT equivalent). Or calculate a suitable
  * level/distErr to index both and call {@link
  * org.apache.lucene.spatial.prefix.PrefixTreeStrategy#createIndexableFields(org.locationtech.spatial4j.shape.Shape,
  * int)} with the same value for all shapes for a given document/field.
  *
  * @lucene.experimental
  */
 public class PrefixTreeFacetCounter {

   /** A callback/visitor of facet counts. */
   public abstract static class FacetVisitor {
     /** Called at the start of the segment, if there is indexed data. */
     public void startOfSegment() {}

     /**
      * Called for cells with a leaf, or cells at the target facet level. {@code count} is greater
      * than zero. When an ancestor cell is given with non-zero count, the count can be considered to
      * be added to all cells below. You won't necessarily get a cell at level {@code facetLevel} if
      * the indexed data is courser (bigger).
      */
     public abstract void visit(Cell cell, int count);
   }

   private PrefixTreeFacetCounter() {}

   /**
    * Computes facets using a callback/visitor style design, allowing flexibility for the caller to
    * determine what to do with each underlying count.
    *
    * @param strategy the prefix tree strategy (contains the field reference, grid, max levels)
    * @param context the IndexReader's context
    * @param topAcceptDocs a Bits to limit counted docs. If null, live docs are counted.
    * @param queryShape the shape to limit the range of facet counts to
    * @param facetLevel the maximum depth (detail) of faceted cells
    * @param facetVisitor the visitor/callback to receive the counts
    */
   public static void compute(
       PrefixTreeStrategy strategy,
       IndexReaderContext context,
       Bits topAcceptDocs,
       Shape queryShape,
       int facetLevel,
       FacetVisitor facetVisitor)
       throws IOException {
     // We collect per-leaf
     for (final LeafReaderContext leafCtx : context.leaves()) {
       // determine leaf acceptDocs Bits
       Bits leafAcceptDocs;
       if (topAcceptDocs == null) {
         leafAcceptDocs = leafCtx.reader().getLiveDocs(); // filter deleted
       } else {
         leafAcceptDocs =
             new Bits() {
               @Override
               public boolean get(int index) {
                 return topAcceptDocs.get(leafCtx.docBase + index);
               }

               @Override
               public int length() {
                 return leafCtx.reader().maxDoc();
               }
             };
       }

       compute(strategy, leafCtx, leafAcceptDocs, queryShape, facetLevel, facetVisitor);
     }
   }

   /** Lower-level per-leaf segment method. */
   public static void compute(
       final PrefixTreeStrategy strategy,
       final LeafReaderContext context,
       final Bits acceptDocs,
       final Shape queryShape,
       final int facetLevel,
       final FacetVisitor facetVisitor)
       throws IOException {
     if (acceptDocs != null && acceptDocs.length() != context.reader().maxDoc()) {
       throw new IllegalArgumentException(
           "acceptDocs bits length "
               + acceptDocs.length()
               + " != leaf maxdoc "
               + context.reader().maxDoc());
     }
     final SpatialPrefixTree tree = strategy.getGrid();

     // scanLevel is an optimization knob of AbstractVisitingPrefixTreeFilter. It's unlikely
     // another scanLevel would be much faster and it tends to be a risky knob (can help a little,
     // can hurt a ton).
     // TODO use RPT's configured scan level?  Do we know better here?  Hard to say.
     final int scanLevel = tree.getMaxLevels();
     // AbstractVisitingPrefixTreeFilter is a Lucene Filter.  We don't need a filter; we use it for
     // its great prefix-tree
     // traversal code.  TODO consider refactoring if/when it makes sense (more use cases than this)
     new AbstractVisitingPrefixTreeQuery(
         queryShape, strategy.getFieldName(), tree, facetLevel, scanLevel) {

       @Override
       public String toString(String field) {
         return "anonPrefixTreeQuery"; // un-used
       }

       @Override
       public DocIdSet getDocIdSet(LeafReaderContext contexts) throws IOException {
         assert facetLevel == super.detailLevel; // same thing, FYI. (constant)

         return new VisitorTemplate(context) {

           @Override
           protected void start() throws IOException {
             facetVisitor.startOfSegment();
           }

           @Override
           protected DocIdSet finish() throws IOException {
             return null; // unused;
           }

           @Override
           protected boolean visitPrefix(Cell cell) throws IOException {
             // At facetLevel...
             if (cell.getLevel() == facetLevel) {
               // Count docs
               visitLeaf(cell); // we're not a leaf but we treat it as such at facet level
               return false; // don't descend further; this is enough detail
             }

             // We optimize for discriminating filters (reflected in acceptDocs) and short-circuit if
             // no
             // matching docs. We could do this at all levels or never but the closer we get to the
             // facet level, the
             // higher the probability this is worthwhile. We do when docFreq == 1 because it's a
             // cheap check, especially
             // due to "pulsing" in the codec.
             // TODO this opt should move to VisitorTemplate (which contains an optimization TODO to
             // this effect)
             if (cell.getLevel() == facetLevel - 1 || termsEnum.docFreq() == 1) {
               if (!hasDocsAtThisTerm()) {
                 return false;
               }
             }
             return true;
           }

           @Override
           protected void visitLeaf(Cell cell) throws IOException {
             final int count = countDocsAtThisTerm();
             if (count > 0) {
               facetVisitor.visit(cell, count);
             }
           }

           private int countDocsAtThisTerm() throws IOException {
             if (acceptDocs == null) {
               return termsEnum.docFreq();
             }
             int count = 0;
             postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
             while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
               if (acceptDocs.get(postingsEnum.docID()) == false) {
                 continue;
               }
               count++;
             }
             return count;
           }

           private boolean hasDocsAtThisTerm() throws IOException {
             if (acceptDocs == null) {
               return true;
             }
             postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
             int nextDoc = postingsEnum.nextDoc();
             while (nextDoc != DocIdSetIterator.NO_MORE_DOCS && acceptDocs.get(nextDoc) == false) {
               nextDoc = postingsEnum.nextDoc();
             }
             return nextDoc != DocIdSetIterator.NO_MORE_DOCS;
           }
         }.getDocIdSet();
       }
     }.getDocIdSet(context);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.spatial.prefix;

	import java.io.IOException;
	import org.apache.lucene.index.IndexReaderContext;
	import org.apache.lucene.index.LeafReaderContext;
	import org.apache.lucene.index.PostingsEnum;
	import org.apache.lucene.search.DocIdSet;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.spatial.prefix.tree.Cell;
	import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
	import org.apache.lucene.util.Bits;
	import org.locationtech.spatial4j.shape.Shape;

	/**
	* Computes facets on cells for {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy}.
	*
	* <p><em>NOTE:</em> If for a given document and a given field using {@link
	* org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy} multiple values are indexed (i.e.
	* multi-valued) and at least one of them is a non-point, then there is a possibility of
	* double-counting the document in the facet results. Since each shape is independently turned into
	* grid cells at a resolution chosen by the shape's size, it's possible they will be indexed at
	* different resolutions. This means the document could be present in BOTH the postings for a cell
	* in both its prefix and leaf variants. To avoid this, use a single valued field with a {@link
	* org.locationtech.spatial4j.shape.ShapeCollection} (or WKT equivalent). Or calculate a suitable
	* level/distErr to index both and call {@link
	* org.apache.lucene.spatial.prefix.PrefixTreeStrategy#createIndexableFields(org.locationtech.spatial4j.shape.Shape,
	* int)} with the same value for all shapes for a given document/field.
	*
	* @lucene.experimental
	*/
	public class PrefixTreeFacetCounter {

	/** A callback/visitor of facet counts. */
	public abstract static class FacetVisitor {
	/** Called at the start of the segment, if there is indexed data. */
	public void startOfSegment() {}

	/**
	* Called for cells with a leaf, or cells at the target facet level. {@code count} is greater
	* than zero. When an ancestor cell is given with non-zero count, the count can be considered to
	* be added to all cells below. You won't necessarily get a cell at level {@code facetLevel} if
	* the indexed data is courser (bigger).
	*/
	public abstract void visit(Cell cell, int count);
	}

	private PrefixTreeFacetCounter() {}

	/**
	* Computes facets using a callback/visitor style design, allowing flexibility for the caller to
	* determine what to do with each underlying count.
	*
	* @param strategy the prefix tree strategy (contains the field reference, grid, max levels)
	* @param context the IndexReader's context
	* @param topAcceptDocs a Bits to limit counted docs. If null, live docs are counted.
	* @param queryShape the shape to limit the range of facet counts to
	* @param facetLevel the maximum depth (detail) of faceted cells
	* @param facetVisitor the visitor/callback to receive the counts
	*/
	public static void compute(
	PrefixTreeStrategy strategy,
	IndexReaderContext context,
	Bits topAcceptDocs,
	Shape queryShape,
	int facetLevel,
	FacetVisitor facetVisitor)
	throws IOException {
	// We collect per-leaf
	for (final LeafReaderContext leafCtx : context.leaves()) {
	// determine leaf acceptDocs Bits
	Bits leafAcceptDocs;
	if (topAcceptDocs == null) {
	leafAcceptDocs = leafCtx.reader().getLiveDocs(); // filter deleted
	} else {
	leafAcceptDocs =
	new Bits() {
	@Override
	public boolean get(int index) {
	return topAcceptDocs.get(leafCtx.docBase + index);
	}

	@Override
	public int length() {
	return leafCtx.reader().maxDoc();
	}
	};
	}

	compute(strategy, leafCtx, leafAcceptDocs, queryShape, facetLevel, facetVisitor);
	}
	}

	/** Lower-level per-leaf segment method. */
	public static void compute(
	final PrefixTreeStrategy strategy,
	final LeafReaderContext context,
	final Bits acceptDocs,
	final Shape queryShape,
	final int facetLevel,
	final FacetVisitor facetVisitor)
	throws IOException {
	if (acceptDocs != null && acceptDocs.length() != context.reader().maxDoc()) {
	throw new IllegalArgumentException(
	"acceptDocs bits length "
	+ acceptDocs.length()
	+ " != leaf maxdoc "
	+ context.reader().maxDoc());
	}
	final SpatialPrefixTree tree = strategy.getGrid();

	// scanLevel is an optimization knob of AbstractVisitingPrefixTreeFilter. It's unlikely
	// another scanLevel would be much faster and it tends to be a risky knob (can help a little,
	// can hurt a ton).
	// TODO use RPT's configured scan level? Do we know better here? Hard to say.
	final int scanLevel = tree.getMaxLevels();
	// AbstractVisitingPrefixTreeFilter is a Lucene Filter. We don't need a filter; we use it for
	// its great prefix-tree
	// traversal code. TODO consider refactoring if/when it makes sense (more use cases than this)
	new AbstractVisitingPrefixTreeQuery(
	queryShape, strategy.getFieldName(), tree, facetLevel, scanLevel) {

	@Override
	public String toString(String field) {
	return "anonPrefixTreeQuery"; // un-used
	}

	@Override
	public DocIdSet getDocIdSet(LeafReaderContext contexts) throws IOException {
	assert facetLevel == super.detailLevel; // same thing, FYI. (constant)

	return new VisitorTemplate(context) {

	@Override
	protected void start() throws IOException {
	facetVisitor.startOfSegment();
	}

	@Override
	protected DocIdSet finish() throws IOException {
	return null; // unused;
	}

	@Override
	protected boolean visitPrefix(Cell cell) throws IOException {
	// At facetLevel...
	if (cell.getLevel() == facetLevel) {
	// Count docs
	visitLeaf(cell); // we're not a leaf but we treat it as such at facet level
	return false; // don't descend further; this is enough detail
	}

	// We optimize for discriminating filters (reflected in acceptDocs) and short-circuit if
	// no
	// matching docs. We could do this at all levels or never but the closer we get to the
	// facet level, the
	// higher the probability this is worthwhile. We do when docFreq == 1 because it's a
	// cheap check, especially
	// due to "pulsing" in the codec.
	// TODO this opt should move to VisitorTemplate (which contains an optimization TODO to
	// this effect)
	if (cell.getLevel() == facetLevel - 1 \|\| termsEnum.docFreq() == 1) {
	if (!hasDocsAtThisTerm()) {
	return false;
	}
	}
	return true;
	}

	@Override
	protected void visitLeaf(Cell cell) throws IOException {
	final int count = countDocsAtThisTerm();
	if (count > 0) {
	facetVisitor.visit(cell, count);
	}
	}

	private int countDocsAtThisTerm() throws IOException {
	if (acceptDocs == null) {
	return termsEnum.docFreq();
	}
	int count = 0;
	postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
	while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
	if (acceptDocs.get(postingsEnum.docID()) == false) {
	continue;
	}
	count++;
	}
	return count;
	}

	private boolean hasDocsAtThisTerm() throws IOException {
	if (acceptDocs == null) {
	return true;
	}
	postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
	int nextDoc = postingsEnum.nextDoc();
	while (nextDoc != DocIdSetIterator.NO_MORE_DOCS && acceptDocs.get(nextDoc) == false) {
	nextDoc = postingsEnum.nextDoc();
	}
	return nextDoc != DocIdSetIterator.NO_MORE_DOCS;
	}
	}.getDocIdSet();
	}
	}.getDocIdSet(context);
	}
	}