lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java - lucene-solr - Git at Google

 package org.apache.lucene.util.bkd;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.Arrays;

 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.RamUsageEstimator;

 /** Handles intersection of an multi-dimensional shape in byte[] space with a block KD-tree previously written with {@link BKDWriter}.
  *
  * @lucene.experimental */

 public final class BKDReader implements Accountable {
   // Packed array of byte[] holding all split values in the full binary tree:
   final private byte[] splitPackedValues;
   final private long[] leafBlockFPs;
   final private int leafNodeOffset;
   final int numDims;
   final int bytesPerDim;
   final IndexInput in;
   final int packedBytesLength;
   final int maxPointsInLeafNode;

   enum Relation {CELL_INSIDE_QUERY, QUERY_CROSSES_CELL, QUERY_OUTSIDE_CELL};

   /** We recurse the BKD tree, using a provided instance of this to guide the recursion.
    *
    * @lucene.experimental */
   public interface IntersectVisitor {
     /** Called for all docs in a leaf cell that's fully contained by the query.  The
      *  consumer should blindly accept the docID. */
     void visit(int docID);

     /** Called for all docs in a leaf cell that crosses the query.  The consumer
      *  should scrutinize the packedValue to decide whether to accept it. */
     void visit(int docID, byte[] packedValue);

     /** Called for non-leaf cells to test how the cell relates to the query, to
      *  determine how to further recurse down the treer. */
     Relation compare(byte[] minPackedValue, byte[] maxPackedValue);
   }

   /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
   public BKDReader(IndexInput in) throws IOException {

     CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_START);
     numDims = in.readVInt();
     maxPointsInLeafNode = in.readVInt();
     bytesPerDim = in.readVInt();
     packedBytesLength = numDims * bytesPerDim;

     // Read index:
     int numLeaves = in.readVInt();
     leafNodeOffset = numLeaves;

     splitPackedValues = new byte[(1+bytesPerDim)*numLeaves];
     in.readBytes(splitPackedValues, 0, splitPackedValues.length);

     // Tree is fully balanced binary tree, so number of nodes = numLeaves-1, except our nodeIDs are 1-based (splitPackedValues[0] is unused):
     leafBlockFPs = new long[numLeaves];
     for(int i=0;i<numLeaves;i++) {
       leafBlockFPs[i] = in.readVLong();
     }

     this.in = in;
   }

   private static final class IntersectState {
     final IndexInput in;
     final int[] scratchDocIDs;
     final byte[] scratchPackedValue;

     // Minimum point of the N-dim rect containing the query shape:
     final byte[] minPacked;
     // Maximum point of the N-dim rect containing the query shape:
     final byte[] maxPacked;
     final IntersectVisitor visitor;

     public IntersectState(IndexInput in, int packedBytesLength,
                           int maxPointsInLeafNode, byte[] minPacked, byte[] maxPacked,
                           IntersectVisitor visitor) {
       this.in = in;
       this.minPacked = minPacked;
       this.maxPacked = maxPacked;
       this.visitor = visitor;
       this.scratchDocIDs = new int[maxPointsInLeafNode];
       this.scratchPackedValue = new byte[packedBytesLength];
     }
   }

   public void intersect(IntersectVisitor visitor) throws IOException {
     byte[] minPacked = new byte[packedBytesLength];
     byte[] maxPacked = new byte[packedBytesLength];
     Arrays.fill(maxPacked, (byte) 0xff);
     intersect(minPacked, maxPacked, visitor);
   }

   public void intersect(byte[] minPacked, byte[] maxPacked, IntersectVisitor visitor) throws IOException {
     IntersectState state = new IntersectState(in.clone(), packedBytesLength,
                                               maxPointsInLeafNode, minPacked, maxPacked,
                                               visitor);
     byte[] rootMinPacked = new byte[packedBytesLength];
     byte[] rootMaxPacked = new byte[packedBytesLength];
     Arrays.fill(rootMaxPacked, (byte) 0xff);
     intersect(state, 1, rootMinPacked, rootMaxPacked);
   }

   /** Fast path: this is called when the query box fully encompasses all cells under this node. */
   private void addAll(IntersectState state, int nodeID) throws IOException {
     //System.out.println("R: addAll nodeID=" + nodeID);

     if (nodeID >= leafNodeOffset) {
       //System.out.println("R:   leaf");

       // Leaf node
       state.in.seek(leafBlockFPs[nodeID-leafNodeOffset]);

       // How many points are stored in this leaf cell:
       int count = state.in.readVInt();

       // TODO: especially for the 1D case, this was a decent speedup, because caller could know it should budget for around XXX docs:
       //state.docs.grow(count);
       int docID = 0;
       for(int i=0;i<count;i++) {
         docID += state.in.readVInt();
         state.visitor.visit(docID);
       }
     } else {
       addAll(state, 2*nodeID);
       addAll(state, 2*nodeID+1);
     }
   }

   private void intersect(IntersectState state,
                         int nodeID,
                         byte[] cellMinPacked, byte[] cellMaxPacked)
     throws IOException {

     //System.out.println("\nR: intersect nodeID=" + nodeID + " cellMin=" + BKDUtil.bytesToInt(cellMinPacked, 0) + " cellMax=" + BKDUtil.bytesToInt(cellMaxPacked, 0));

     // Optimization: only check the visitor when the current cell does not fully contain the bbox.  E.g. if the
     // query is a small area around London, UK, most of the high nodes in the BKD tree as we recurse will fully
     // contain the query, so we quickly recurse down until the nodes cross the query:
     boolean cellContainsQuery = BKDUtil.contains(bytesPerDim,
                                                  cellMinPacked, cellMaxPacked,
                                                  state.minPacked, state.maxPacked);

     //System.out.println("R: cellContainsQuery=" + cellContainsQuery);

     if (cellContainsQuery == false) {

       Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);
       //System.out.println("R: relation=" + r);

       if (r == Relation.QUERY_OUTSIDE_CELL) {
         // This cell is fully outside of the query shape: stop recursing
         return;
       } else if (r == Relation.CELL_INSIDE_QUERY) {
         // This cell is fully inside of the query shape: recursively add all points in this cell without filtering
         addAll(state, nodeID);
         return;
       } else {
         // The cell crosses the shape boundary, so we fall through and do full filtering
       }
     }

     if (nodeID >= leafNodeOffset) {
       // Leaf node; scan and filter all points in this block:
       //System.out.println("    intersect leaf nodeID=" + nodeID + " vs leafNodeOffset=" + leafNodeOffset + " fp=" + leafBlockFPs[nodeID-leafNodeOffset]);

       state.in.seek(leafBlockFPs[nodeID-leafNodeOffset]);

       // How many points are stored in this leaf cell:
       int count = state.in.readVInt();

       // TODO: we could maybe pollute the IntersectVisitor API with a "grow" method if this maybe helps perf
       // enough (it did before, esp. for the 1D case):
       //state.docs.grow(count);
       int docID = 0;
       for(int i=0;i<count;i++) {
         docID += state.in.readVInt();
         state.scratchDocIDs[i] = docID;
       }

       // Again, this time reading values and checking with the visitor
       for(int i=0;i<count;i++) {
         state.in.readBytes(state.scratchPackedValue, 0, state.scratchPackedValue.length);
         state.visitor.visit(state.scratchDocIDs[i], state.scratchPackedValue);
       }

     } else {

       // Non-leaf node: recurse on the split left and right nodes

       int address = nodeID * (bytesPerDim+1);
       int splitDim = splitPackedValues[address] & 0xff;
       assert splitDim < numDims;

       // TODO: can we alloc & reuse this up front?
       byte[] splitValue = new byte[bytesPerDim];
       System.arraycopy(splitPackedValues, address+1, splitValue, 0, bytesPerDim);

       // TODO: can we alloc & reuse this up front?
       byte[] splitPackedValue = new byte[packedBytesLength];

       if (BKDUtil.compare(bytesPerDim, state.minPacked, splitDim, splitValue, 0) <= 0) {
         // The query bbox overlaps our left cell, so we must recurse:
         System.arraycopy(state.maxPacked, 0, splitPackedValue, 0, packedBytesLength);
         System.arraycopy(splitValue, 0, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
         intersect(state,
                   2*nodeID,
                   cellMinPacked, splitPackedValue);
       }

       if (BKDUtil.compare(bytesPerDim, state.maxPacked, splitDim, splitValue, 0) >= 0) {
         // The query bbox overlaps our left cell, so we must recurse:
         System.arraycopy(state.minPacked, 0, splitPackedValue, 0, packedBytesLength);
         System.arraycopy(splitValue, 0, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
         intersect(state,
                   2*nodeID+1,
                   splitPackedValue, cellMaxPacked);
       }
     }
   }

   @Override
   public long ramBytesUsed() {
     return splitPackedValues.length +
       leafBlockFPs.length * RamUsageEstimator.NUM_BYTES_LONG;
   }
 }
	package org.apache.lucene.util.bkd;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;
	import java.util.Arrays;

	import org.apache.lucene.codecs.CodecUtil;
	import org.apache.lucene.store.IndexInput;
	import org.apache.lucene.util.Accountable;
	import org.apache.lucene.util.RamUsageEstimator;

	/** Handles intersection of an multi-dimensional shape in byte[] space with a block KD-tree previously written with {@link BKDWriter}.
	*
	* @lucene.experimental */

	public final class BKDReader implements Accountable {
	// Packed array of byte[] holding all split values in the full binary tree:
	final private byte[] splitPackedValues;
	final private long[] leafBlockFPs;
	final private int leafNodeOffset;
	final int numDims;
	final int bytesPerDim;
	final IndexInput in;
	final int packedBytesLength;
	final int maxPointsInLeafNode;

	enum Relation {CELL_INSIDE_QUERY, QUERY_CROSSES_CELL, QUERY_OUTSIDE_CELL};

	/** We recurse the BKD tree, using a provided instance of this to guide the recursion.
	*
	* @lucene.experimental */
	public interface IntersectVisitor {
	/** Called for all docs in a leaf cell that's fully contained by the query. The
	* consumer should blindly accept the docID. */
	void visit(int docID);

	/** Called for all docs in a leaf cell that crosses the query. The consumer
	* should scrutinize the packedValue to decide whether to accept it. */
	void visit(int docID, byte[] packedValue);

	/** Called for non-leaf cells to test how the cell relates to the query, to
	* determine how to further recurse down the treer. */
	Relation compare(byte[] minPackedValue, byte[] maxPackedValue);
	}

	/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
	public BKDReader(IndexInput in) throws IOException {

	CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_START);
	numDims = in.readVInt();
	maxPointsInLeafNode = in.readVInt();
	bytesPerDim = in.readVInt();
	packedBytesLength = numDims * bytesPerDim;

	// Read index:
	int numLeaves = in.readVInt();
	leafNodeOffset = numLeaves;

	splitPackedValues = new byte[(1+bytesPerDim)*numLeaves];
	in.readBytes(splitPackedValues, 0, splitPackedValues.length);

	// Tree is fully balanced binary tree, so number of nodes = numLeaves-1, except our nodeIDs are 1-based (splitPackedValues[0] is unused):
	leafBlockFPs = new long[numLeaves];
	for(int i=0;i<numLeaves;i++) {
	leafBlockFPs[i] = in.readVLong();
	}

	this.in = in;
	}

	private static final class IntersectState {
	final IndexInput in;
	final int[] scratchDocIDs;
	final byte[] scratchPackedValue;

	// Minimum point of the N-dim rect containing the query shape:
	final byte[] minPacked;
	// Maximum point of the N-dim rect containing the query shape:
	final byte[] maxPacked;
	final IntersectVisitor visitor;

	public IntersectState(IndexInput in, int packedBytesLength,
	int maxPointsInLeafNode, byte[] minPacked, byte[] maxPacked,
	IntersectVisitor visitor) {
	this.in = in;
	this.minPacked = minPacked;
	this.maxPacked = maxPacked;
	this.visitor = visitor;
	this.scratchDocIDs = new int[maxPointsInLeafNode];
	this.scratchPackedValue = new byte[packedBytesLength];
	}
	}

	public void intersect(IntersectVisitor visitor) throws IOException {
	byte[] minPacked = new byte[packedBytesLength];
	byte[] maxPacked = new byte[packedBytesLength];
	Arrays.fill(maxPacked, (byte) 0xff);
	intersect(minPacked, maxPacked, visitor);
	}

	public void intersect(byte[] minPacked, byte[] maxPacked, IntersectVisitor visitor) throws IOException {
	IntersectState state = new IntersectState(in.clone(), packedBytesLength,
	maxPointsInLeafNode, minPacked, maxPacked,
	visitor);
	byte[] rootMinPacked = new byte[packedBytesLength];
	byte[] rootMaxPacked = new byte[packedBytesLength];
	Arrays.fill(rootMaxPacked, (byte) 0xff);
	intersect(state, 1, rootMinPacked, rootMaxPacked);
	}

	/** Fast path: this is called when the query box fully encompasses all cells under this node. */
	private void addAll(IntersectState state, int nodeID) throws IOException {
	//System.out.println("R: addAll nodeID=" + nodeID);

	if (nodeID >= leafNodeOffset) {
	//System.out.println("R: leaf");

	// Leaf node
	state.in.seek(leafBlockFPs[nodeID-leafNodeOffset]);

	// How many points are stored in this leaf cell:
	int count = state.in.readVInt();

	// TODO: especially for the 1D case, this was a decent speedup, because caller could know it should budget for around XXX docs:
	//state.docs.grow(count);
	int docID = 0;
	for(int i=0;i<count;i++) {
	docID += state.in.readVInt();
	state.visitor.visit(docID);
	}
	} else {
	addAll(state, 2*nodeID);
	addAll(state, 2*nodeID+1);
	}
	}

	private void intersect(IntersectState state,
	int nodeID,
	byte[] cellMinPacked, byte[] cellMaxPacked)
	throws IOException {

	//System.out.println("\nR: intersect nodeID=" + nodeID + " cellMin=" + BKDUtil.bytesToInt(cellMinPacked, 0) + " cellMax=" + BKDUtil.bytesToInt(cellMaxPacked, 0));

	// Optimization: only check the visitor when the current cell does not fully contain the bbox. E.g. if the
	// query is a small area around London, UK, most of the high nodes in the BKD tree as we recurse will fully
	// contain the query, so we quickly recurse down until the nodes cross the query:
	boolean cellContainsQuery = BKDUtil.contains(bytesPerDim,
	cellMinPacked, cellMaxPacked,
	state.minPacked, state.maxPacked);

	//System.out.println("R: cellContainsQuery=" + cellContainsQuery);

	if (cellContainsQuery == false) {

	Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);
	//System.out.println("R: relation=" + r);

	if (r == Relation.QUERY_OUTSIDE_CELL) {
	// This cell is fully outside of the query shape: stop recursing
	return;
	} else if (r == Relation.CELL_INSIDE_QUERY) {
	// This cell is fully inside of the query shape: recursively add all points in this cell without filtering
	addAll(state, nodeID);
	return;
	} else {
	// The cell crosses the shape boundary, so we fall through and do full filtering
	}
	}

	if (nodeID >= leafNodeOffset) {
	// Leaf node; scan and filter all points in this block:
	//System.out.println(" intersect leaf nodeID=" + nodeID + " vs leafNodeOffset=" + leafNodeOffset + " fp=" + leafBlockFPs[nodeID-leafNodeOffset]);

	state.in.seek(leafBlockFPs[nodeID-leafNodeOffset]);

	// How many points are stored in this leaf cell:
	int count = state.in.readVInt();

	// TODO: we could maybe pollute the IntersectVisitor API with a "grow" method if this maybe helps perf
	// enough (it did before, esp. for the 1D case):
	//state.docs.grow(count);
	int docID = 0;
	for(int i=0;i<count;i++) {
	docID += state.in.readVInt();
	state.scratchDocIDs[i] = docID;
	}

	// Again, this time reading values and checking with the visitor
	for(int i=0;i<count;i++) {
	state.in.readBytes(state.scratchPackedValue, 0, state.scratchPackedValue.length);
	state.visitor.visit(state.scratchDocIDs[i], state.scratchPackedValue);
	}

	} else {

	// Non-leaf node: recurse on the split left and right nodes

	int address = nodeID * (bytesPerDim+1);
	int splitDim = splitPackedValues[address] & 0xff;
	assert splitDim < numDims;

	// TODO: can we alloc & reuse this up front?
	byte[] splitValue = new byte[bytesPerDim];
	System.arraycopy(splitPackedValues, address+1, splitValue, 0, bytesPerDim);

	// TODO: can we alloc & reuse this up front?
	byte[] splitPackedValue = new byte[packedBytesLength];

	if (BKDUtil.compare(bytesPerDim, state.minPacked, splitDim, splitValue, 0) <= 0) {
	// The query bbox overlaps our left cell, so we must recurse:
	System.arraycopy(state.maxPacked, 0, splitPackedValue, 0, packedBytesLength);
	System.arraycopy(splitValue, 0, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
	intersect(state,
	2*nodeID,
	cellMinPacked, splitPackedValue);
	}

	if (BKDUtil.compare(bytesPerDim, state.maxPacked, splitDim, splitValue, 0) >= 0) {
	// The query bbox overlaps our left cell, so we must recurse:
	System.arraycopy(state.minPacked, 0, splitPackedValue, 0, packedBytesLength);
	System.arraycopy(splitValue, 0, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
	intersect(state,
	2*nodeID+1,
	splitPackedValue, cellMaxPacked);
	}
	}
	}

	@Override
	public long ramBytesUsed() {
	return splitPackedValues.length +
	leafBlockFPs.length * RamUsageEstimator.NUM_BYTES_LONG;
	}
	}