| package org.apache.lucene.util.bkd; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.index.DimensionalValues.IntersectVisitor; |
| import org.apache.lucene.index.DimensionalValues.Relation; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| /** Handles intersection of an multi-dimensional shape in byte[] space with a block KD-tree previously written with {@link BKDWriter}. |
| * |
| * @lucene.experimental */ |
| |
| public class BKDReader implements Accountable { |
| // Packed array of byte[] holding all split values in the full binary tree: |
| final private byte[] splitPackedValues; |
| final private long[] leafBlockFPs; |
| final private int leafNodeOffset; |
| final int numDims; |
| final int bytesPerDim; |
| final IndexInput in; |
| final int maxPointsInLeafNode; |
| protected final int packedBytesLength; |
| |
| /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */ |
| public BKDReader(IndexInput in) throws IOException { |
| CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_START); |
| numDims = in.readVInt(); |
| maxPointsInLeafNode = in.readVInt(); |
| bytesPerDim = in.readVInt(); |
| packedBytesLength = numDims * bytesPerDim; |
| |
| // Read index: |
| int numLeaves = in.readVInt(); |
| leafNodeOffset = numLeaves; |
| |
| splitPackedValues = new byte[(1+bytesPerDim)*numLeaves]; |
| in.readBytes(splitPackedValues, 0, splitPackedValues.length); |
| |
| // Tree is fully balanced binary tree, so number of nodes = numLeaves-1, except our nodeIDs are 1-based (splitPackedValues[0] is unused): |
| leafBlockFPs = new long[numLeaves]; |
| for(int i=0;i<numLeaves;i++) { |
| leafBlockFPs[i] = in.readVLong(); |
| } |
| |
| this.in = in; |
| } |
| |
| protected BKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException { |
| this.in = in; |
| this.numDims = numDims; |
| this.maxPointsInLeafNode = maxPointsInLeafNode; |
| this.bytesPerDim = bytesPerDim; |
| packedBytesLength = numDims * bytesPerDim; |
| this.leafNodeOffset = leafBlockFPs.length; |
| this.leafBlockFPs = leafBlockFPs; |
| this.splitPackedValues = splitPackedValues; |
| } |
| |
| private static final class IntersectState { |
| final IndexInput in; |
| final int[] scratchDocIDs; |
| final byte[] scratchPackedValue; |
| |
| final IntersectVisitor visitor; |
| |
| public IntersectState(IndexInput in, int packedBytesLength, |
| int maxPointsInLeafNode, |
| IntersectVisitor visitor) { |
| this.in = in; |
| this.visitor = visitor; |
| this.scratchDocIDs = new int[maxPointsInLeafNode]; |
| this.scratchPackedValue = new byte[packedBytesLength]; |
| } |
| } |
| |
| public void intersect(IntersectVisitor visitor) throws IOException { |
| IntersectState state = new IntersectState(in.clone(), packedBytesLength, |
| maxPointsInLeafNode, |
| visitor); |
| byte[] rootMinPacked = new byte[packedBytesLength]; |
| byte[] rootMaxPacked = new byte[packedBytesLength]; |
| Arrays.fill(rootMaxPacked, (byte) 0xff); |
| intersect(state, 1, rootMinPacked, rootMaxPacked); |
| } |
| |
| /** Fast path: this is called when the query box fully encompasses all cells under this node. */ |
| private void addAll(IntersectState state, int nodeID) throws IOException { |
| //System.out.println("R: addAll nodeID=" + nodeID); |
| |
| if (nodeID >= leafNodeOffset) { |
| visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor); |
| } else { |
| addAll(state, 2*nodeID); |
| addAll(state, 2*nodeID+1); |
| } |
| } |
| |
| protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { |
| // Leaf node |
| in.seek(blockFP); |
| |
| // How many points are stored in this leaf cell: |
| int count = in.readVInt(); |
| |
| // TODO: especially for the 1D case, this was a decent speedup, because caller could know it should budget for around XXX docs: |
| //state.docs.grow(count); |
| int docID = 0; |
| for(int i=0;i<count;i++) { |
| docID += in.readVInt(); |
| visitor.visit(docID); |
| } |
| } |
| |
| protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { |
| in.seek(blockFP); |
| |
| // How many points are stored in this leaf cell: |
| int count = in.readVInt(); |
| |
| // TODO: we could maybe pollute the IntersectVisitor API with a "grow" method if this maybe helps perf |
| // enough (it did before, esp. for the 1D case): |
| //state.docs.grow(count); |
| int docID = 0; |
| for(int i=0;i<count;i++) { |
| docID += in.readVInt(); |
| docIDs[i] = docID; |
| } |
| |
| return count; |
| } |
| |
| protected void visitDocValues(byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { |
| for(int i=0;i<count;i++) { |
| in.readBytes(scratchPackedValue, 0, scratchPackedValue.length); |
| visitor.visit(docIDs[i], scratchPackedValue); |
| } |
| } |
| |
| private void intersect(IntersectState state, |
| int nodeID, |
| byte[] cellMinPacked, byte[] cellMaxPacked) |
| throws IOException { |
| |
| /* |
| System.out.println("\nR: intersect nodeID=" + nodeID); |
| for(int dim=0;dim<numDims;dim++) { |
| System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim)); |
| } |
| */ |
| |
| Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked); |
| |
| if (r == Relation.QUERY_OUTSIDE_CELL) { |
| // This cell is fully outside of the query shape: stop recursing |
| return; |
| } else if (r == Relation.CELL_INSIDE_QUERY) { |
| // This cell is fully inside of the query shape: recursively add all points in this cell without filtering |
| addAll(state, nodeID); |
| return; |
| } else { |
| // The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering |
| } |
| |
| if (nodeID >= leafNodeOffset) { |
| // Leaf node; scan and filter all points in this block: |
| int count = readDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.scratchDocIDs); |
| |
| // Again, this time reading values and checking with the visitor |
| visitDocValues(state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor); |
| |
| } else { |
| |
| // Non-leaf node: recurse on the split left and right nodes |
| |
| int address = nodeID * (bytesPerDim+1); |
| int splitDim = splitPackedValues[address] & 0xff; |
| assert splitDim < numDims; |
| |
| // TODO: can we alloc & reuse this up front? |
| byte[] splitValue = new byte[bytesPerDim]; |
| System.arraycopy(splitPackedValues, address+1, splitValue, 0, bytesPerDim); |
| |
| // TODO: can we alloc & reuse this up front? |
| byte[] splitPackedValue = new byte[packedBytesLength]; |
| |
| // Recurse on left sub-tree: |
| System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength); |
| System.arraycopy(splitValue, 0, splitPackedValue, splitDim*bytesPerDim, bytesPerDim); |
| intersect(state, |
| 2*nodeID, |
| cellMinPacked, splitPackedValue); |
| |
| // Recurse on right sub-tree: |
| System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength); |
| System.arraycopy(splitValue, 0, splitPackedValue, splitDim*bytesPerDim, bytesPerDim); |
| intersect(state, |
| 2*nodeID+1, |
| splitPackedValue, cellMaxPacked); |
| } |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return splitPackedValues.length + |
| leafBlockFPs.length * RamUsageEstimator.NUM_BYTES_LONG; |
| } |
| } |