lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/PrefixTreeFacetCounter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.spatial.prefix;

 import java.io.IOException;

 import org.locationtech.spatial4j.shape.Shape;
 import org.apache.lucene.index.IndexReaderContext;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.spatial.prefix.tree.Cell;
 import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
 import org.apache.lucene.util.Bits;

 /**
  * Computes facets on cells for {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy}.
  * <p>
  * <em>NOTE:</em> If for a given document and a given field using
  * {@link org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy}
  * multiple values are indexed (i.e. multi-valued) and at least one of them is a non-point, then there is a possibility
  * of double-counting the document in the facet results.  Since each shape is independently turned into grid cells at
  * a resolution chosen by the shape's size, it's possible they will be indexed at different resolutions.  This means
  * the document could be present in BOTH the postings for a cell in both its prefix and leaf variants.  To avoid this,
  * use a single valued field with a {@link org.locationtech.spatial4j.shape.ShapeCollection} (or WKT equivalent).  Or
  * calculate a suitable level/distErr to index both and call
  * {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy#createIndexableFields(org.locationtech.spatial4j.shape.Shape, int)}
  * with the same value for all shapes for a given document/field.
  *
  * @lucene.experimental
  */
 public class PrefixTreeFacetCounter {

   /** A callback/visitor of facet counts. */
   public static abstract class FacetVisitor {
     /** Called at the start of the segment, if there is indexed data. */
     public void startOfSegment() {}

     /** Called for cells with a leaf, or cells at the target facet level.  {@code count} is greater than zero.
      * When an ancestor cell is given with non-zero count, the count can be considered to be added to all cells
      * below. You won't necessarily get a cell at level {@code facetLevel} if the indexed data is courser (bigger).
      */
     public abstract void visit(Cell cell, int count);
   }

   private PrefixTreeFacetCounter() {
   }

   /**
    * Computes facets using a callback/visitor style design, allowing flexibility for the caller to determine what to do
    * with each underlying count.
    * @param strategy the prefix tree strategy (contains the field reference, grid, max levels)
    * @param context the IndexReader's context
    * @param topAcceptDocs a Bits to limit counted docs. If null, live docs are counted.
    * @param queryShape the shape to limit the range of facet counts to
    * @param facetLevel the maximum depth (detail) of faceted cells
    * @param facetVisitor the visitor/callback to receive the counts
    */
   public static void compute(PrefixTreeStrategy strategy, IndexReaderContext context, Bits topAcceptDocs,
                              Shape queryShape, int facetLevel, FacetVisitor facetVisitor)
       throws IOException {
     //We collect per-leaf
     for (final LeafReaderContext leafCtx : context.leaves()) {
       //determine leaf acceptDocs Bits
       Bits leafAcceptDocs;
       if (topAcceptDocs == null) {
         leafAcceptDocs = leafCtx.reader().getLiveDocs();//filter deleted
       } else {
         leafAcceptDocs = new Bits() {
           @Override
           public boolean get(int index) {
             return topAcceptDocs.get(leafCtx.docBase + index);
           }

           @Override
           public int length() {
             return leafCtx.reader().maxDoc();
           }
         };
       }

       compute(strategy, leafCtx, leafAcceptDocs, queryShape, facetLevel, facetVisitor);
     }
   }

   /** Lower-level per-leaf segment method. */
   public static void compute(final PrefixTreeStrategy strategy, final LeafReaderContext context, final Bits acceptDocs,
                              final Shape queryShape, final int facetLevel, final FacetVisitor facetVisitor)
       throws IOException {
     if (acceptDocs != null && acceptDocs.length() != context.reader().maxDoc()) {
       throw new IllegalArgumentException(
           "acceptDocs bits length " + acceptDocs.length() +" != leaf maxdoc " + context.reader().maxDoc());
     }
     final SpatialPrefixTree tree = strategy.getGrid();

     //scanLevel is an optimization knob of AbstractVisitingPrefixTreeFilter. It's unlikely
     // another scanLevel would be much faster and it tends to be a risky knob (can help a little, can hurt a ton).
     // TODO use RPT's configured scan level?  Do we know better here?  Hard to say.
     final int scanLevel = tree.getMaxLevels();
     //AbstractVisitingPrefixTreeFilter is a Lucene Filter.  We don't need a filter; we use it for its great prefix-tree
     // traversal code.  TODO consider refactoring if/when it makes sense (more use cases than this)
     new AbstractVisitingPrefixTreeQuery(queryShape, strategy.getFieldName(), tree, facetLevel, scanLevel) {

       @Override
       public String toString(String field) {
         return "anonPrefixTreeQuery";//un-used
       }

       @Override
       public DocIdSet getDocIdSet(LeafReaderContext contexts) throws IOException {
         assert facetLevel == super.detailLevel;//same thing, FYI. (constant)

         return new VisitorTemplate(context) {

           @Override
           protected void start() throws IOException {
             facetVisitor.startOfSegment();
           }

           @Override
           protected DocIdSet finish() throws IOException {
             return null;//unused;
           }

           @Override
           protected boolean visitPrefix(Cell cell) throws IOException {
             // At facetLevel...
             if (cell.getLevel() == facetLevel) {
               // Count docs
               visitLeaf(cell);//we're not a leaf but we treat it as such at facet level
               return false;//don't descend further; this is enough detail
             }

             // We optimize for discriminating filters (reflected in acceptDocs) and short-circuit if no
             // matching docs. We could do this at all levels or never but the closer we get to the facet level, the
             // higher the probability this is worthwhile. We do when docFreq == 1 because it's a cheap check, especially
             // due to "pulsing" in the codec.
             //TODO this opt should move to VisitorTemplate (which contains an optimization TODO to this effect)
             if (cell.getLevel() == facetLevel - 1 || termsEnum.docFreq() == 1) {
               if (!hasDocsAtThisTerm()) {
                 return false;
               }
             }
             return true;
           }

           @Override
           protected void visitLeaf(Cell cell) throws IOException {
             final int count = countDocsAtThisTerm();
             if (count > 0) {
               facetVisitor.visit(cell, count);
             }
           }

           private int countDocsAtThisTerm() throws IOException {
             if (acceptDocs == null) {
               return termsEnum.docFreq();
             }
             int count = 0;
             postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
             while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
               if (acceptDocs.get(postingsEnum.docID()) == false) {
                 continue;
               }
               count++;
             }
             return count;
           }

           private boolean hasDocsAtThisTerm() throws IOException {
             if (acceptDocs == null) {
               return true;
             }
             postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
             int nextDoc = postingsEnum.nextDoc();
             while (nextDoc != DocIdSetIterator.NO_MORE_DOCS && acceptDocs.get(nextDoc) == false) {
               nextDoc = postingsEnum.nextDoc();
             }
             return nextDoc != DocIdSetIterator.NO_MORE_DOCS;
           }

         }.getDocIdSet();
       }
     }.getDocIdSet(context);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.spatial.prefix;

	import java.io.IOException;

	import org.locationtech.spatial4j.shape.Shape;
	import org.apache.lucene.index.IndexReaderContext;
	import org.apache.lucene.index.LeafReaderContext;
	import org.apache.lucene.index.PostingsEnum;
	import org.apache.lucene.search.DocIdSet;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.spatial.prefix.tree.Cell;
	import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
	import org.apache.lucene.util.Bits;

	/**
	* Computes facets on cells for {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy}.
	* <p>
	* <em>NOTE:</em> If for a given document and a given field using
	* {@link org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy}
	* multiple values are indexed (i.e. multi-valued) and at least one of them is a non-point, then there is a possibility
	* of double-counting the document in the facet results. Since each shape is independently turned into grid cells at
	* a resolution chosen by the shape's size, it's possible they will be indexed at different resolutions. This means
	* the document could be present in BOTH the postings for a cell in both its prefix and leaf variants. To avoid this,
	* use a single valued field with a {@link org.locationtech.spatial4j.shape.ShapeCollection} (or WKT equivalent). Or
	* calculate a suitable level/distErr to index both and call
	* {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy#createIndexableFields(org.locationtech.spatial4j.shape.Shape, int)}
	* with the same value for all shapes for a given document/field.
	*
	* @lucene.experimental
	*/
	public class PrefixTreeFacetCounter {

	/** A callback/visitor of facet counts. */
	public static abstract class FacetVisitor {
	/** Called at the start of the segment, if there is indexed data. */
	public void startOfSegment() {}

	/** Called for cells with a leaf, or cells at the target facet level. {@code count} is greater than zero.
	* When an ancestor cell is given with non-zero count, the count can be considered to be added to all cells
	* below. You won't necessarily get a cell at level {@code facetLevel} if the indexed data is courser (bigger).
	*/
	public abstract void visit(Cell cell, int count);
	}

	private PrefixTreeFacetCounter() {
	}

	/**
	* Computes facets using a callback/visitor style design, allowing flexibility for the caller to determine what to do
	* with each underlying count.
	* @param strategy the prefix tree strategy (contains the field reference, grid, max levels)
	* @param context the IndexReader's context
	* @param topAcceptDocs a Bits to limit counted docs. If null, live docs are counted.
	* @param queryShape the shape to limit the range of facet counts to
	* @param facetLevel the maximum depth (detail) of faceted cells
	* @param facetVisitor the visitor/callback to receive the counts
	*/
	public static void compute(PrefixTreeStrategy strategy, IndexReaderContext context, Bits topAcceptDocs,
	Shape queryShape, int facetLevel, FacetVisitor facetVisitor)
	throws IOException {
	//We collect per-leaf
	for (final LeafReaderContext leafCtx : context.leaves()) {
	//determine leaf acceptDocs Bits
	Bits leafAcceptDocs;
	if (topAcceptDocs == null) {
	leafAcceptDocs = leafCtx.reader().getLiveDocs();//filter deleted
	} else {
	leafAcceptDocs = new Bits() {
	@Override
	public boolean get(int index) {
	return topAcceptDocs.get(leafCtx.docBase + index);
	}

	@Override
	public int length() {
	return leafCtx.reader().maxDoc();
	}
	};
	}

	compute(strategy, leafCtx, leafAcceptDocs, queryShape, facetLevel, facetVisitor);
	}
	}

	/** Lower-level per-leaf segment method. */
	public static void compute(final PrefixTreeStrategy strategy, final LeafReaderContext context, final Bits acceptDocs,
	final Shape queryShape, final int facetLevel, final FacetVisitor facetVisitor)
	throws IOException {
	if (acceptDocs != null && acceptDocs.length() != context.reader().maxDoc()) {
	throw new IllegalArgumentException(
	"acceptDocs bits length " + acceptDocs.length() +" != leaf maxdoc " + context.reader().maxDoc());
	}
	final SpatialPrefixTree tree = strategy.getGrid();

	//scanLevel is an optimization knob of AbstractVisitingPrefixTreeFilter. It's unlikely
	// another scanLevel would be much faster and it tends to be a risky knob (can help a little, can hurt a ton).
	// TODO use RPT's configured scan level? Do we know better here? Hard to say.
	final int scanLevel = tree.getMaxLevels();
	//AbstractVisitingPrefixTreeFilter is a Lucene Filter. We don't need a filter; we use it for its great prefix-tree
	// traversal code. TODO consider refactoring if/when it makes sense (more use cases than this)
	new AbstractVisitingPrefixTreeQuery(queryShape, strategy.getFieldName(), tree, facetLevel, scanLevel) {

	@Override
	public String toString(String field) {
	return "anonPrefixTreeQuery";//un-used
	}

	@Override
	public DocIdSet getDocIdSet(LeafReaderContext contexts) throws IOException {
	assert facetLevel == super.detailLevel;//same thing, FYI. (constant)

	return new VisitorTemplate(context) {

	@Override
	protected void start() throws IOException {
	facetVisitor.startOfSegment();
	}

	@Override
	protected DocIdSet finish() throws IOException {
	return null;//unused;
	}

	@Override
	protected boolean visitPrefix(Cell cell) throws IOException {
	// At facetLevel...
	if (cell.getLevel() == facetLevel) {
	// Count docs
	visitLeaf(cell);//we're not a leaf but we treat it as such at facet level
	return false;//don't descend further; this is enough detail
	}

	// We optimize for discriminating filters (reflected in acceptDocs) and short-circuit if no
	// matching docs. We could do this at all levels or never but the closer we get to the facet level, the
	// higher the probability this is worthwhile. We do when docFreq == 1 because it's a cheap check, especially
	// due to "pulsing" in the codec.
	//TODO this opt should move to VisitorTemplate (which contains an optimization TODO to this effect)
	if (cell.getLevel() == facetLevel - 1 \|\| termsEnum.docFreq() == 1) {
	if (!hasDocsAtThisTerm()) {
	return false;
	}
	}
	return true;
	}

	@Override
	protected void visitLeaf(Cell cell) throws IOException {
	final int count = countDocsAtThisTerm();
	if (count > 0) {
	facetVisitor.visit(cell, count);
	}
	}

	private int countDocsAtThisTerm() throws IOException {
	if (acceptDocs == null) {
	return termsEnum.docFreq();
	}
	int count = 0;
	postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
	while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
	if (acceptDocs.get(postingsEnum.docID()) == false) {
	continue;
	}
	count++;
	}
	return count;
	}

	private boolean hasDocsAtThisTerm() throws IOException {
	if (acceptDocs == null) {
	return true;
	}
	postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
	int nextDoc = postingsEnum.nextDoc();
	while (nextDoc != DocIdSetIterator.NO_MORE_DOCS && acceptDocs.get(nextDoc) == false) {
	nextDoc = postingsEnum.nextDoc();
	}
	return nextDoc != DocIdSetIterator.NO_MORE_DOCS;
	}

	}.getDocIdSet();
	}
	}.getDocIdSet(context);
	}
	}