| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.simpletext; |
| |
| import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_COUNT; |
| import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID; |
| import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE; |
| |
| import java.io.IOException; |
| import java.nio.charset.StandardCharsets; |
| import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.index.PointValues; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util.bkd.BKDReader; |
| |
| /** Forked from {@link BKDReader} and simplified/specialized for SimpleText's usage */ |
| final class SimpleTextBKDReader extends PointValues implements Accountable { |
| // Packed array of byte[] holding all split values in the full binary tree: |
| private final byte[] splitPackedValues; |
| final long[] leafBlockFPs; |
| private final int leafNodeOffset; |
| final int numDims; |
| final int numIndexDims; |
| final int bytesPerDim; |
| final int bytesPerIndexEntry; |
| final IndexInput in; |
| final int maxPointsInLeafNode; |
| final byte[] minPackedValue; |
| final byte[] maxPackedValue; |
| final long pointCount; |
| final int docCount; |
| final int version; |
| protected final int packedBytesLength; |
| protected final int packedIndexBytesLength; |
| |
| public SimpleTextBKDReader( |
| IndexInput in, |
| int numDims, |
| int numIndexDims, |
| int maxPointsInLeafNode, |
| int bytesPerDim, |
| long[] leafBlockFPs, |
| byte[] splitPackedValues, |
| byte[] minPackedValue, |
| byte[] maxPackedValue, |
| long pointCount, |
| int docCount) |
| throws IOException { |
| this.in = in; |
| this.numDims = numDims; |
| this.numIndexDims = numIndexDims; |
| this.maxPointsInLeafNode = maxPointsInLeafNode; |
| this.bytesPerDim = bytesPerDim; |
| // no version check here because callers of this API (SimpleText) have no back compat: |
| bytesPerIndexEntry = numIndexDims == 1 ? bytesPerDim : bytesPerDim + 1; |
| packedBytesLength = numDims * bytesPerDim; |
| packedIndexBytesLength = numIndexDims * bytesPerDim; |
| this.leafNodeOffset = leafBlockFPs.length; |
| this.leafBlockFPs = leafBlockFPs; |
| this.splitPackedValues = splitPackedValues; |
| this.minPackedValue = minPackedValue; |
| this.maxPackedValue = maxPackedValue; |
| this.pointCount = pointCount; |
| this.docCount = docCount; |
| this.version = SimpleTextBKDWriter.VERSION_CURRENT; |
| assert minPackedValue.length == packedIndexBytesLength; |
| assert maxPackedValue.length == packedIndexBytesLength; |
| } |
| |
| /** Used to track all state for a single call to {@link #intersect}. */ |
| public static final class IntersectState { |
| final IndexInput in; |
| final int[] scratchDocIDs; |
| final byte[] scratchPackedValue; |
| final int[] commonPrefixLengths; |
| |
| final IntersectVisitor visitor; |
| |
| public IntersectState( |
| IndexInput in, |
| int numDims, |
| int packedBytesLength, |
| int maxPointsInLeafNode, |
| IntersectVisitor visitor) { |
| this.in = in; |
| this.visitor = visitor; |
| this.commonPrefixLengths = new int[numDims]; |
| this.scratchDocIDs = new int[maxPointsInLeafNode]; |
| this.scratchPackedValue = new byte[packedBytesLength]; |
| } |
| } |
| |
| public void intersect(IntersectVisitor visitor) throws IOException { |
| intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue); |
| } |
| |
| /** Fast path: this is called when the query box fully encompasses all cells under this node. */ |
| private void addAll(IntersectState state, int nodeID) throws IOException { |
| // System.out.println("R: addAll nodeID=" + nodeID); |
| |
| if (nodeID >= leafNodeOffset) { |
| // System.out.println("ADDALL"); |
| visitDocIDs(state.in, leafBlockFPs[nodeID - leafNodeOffset], state.visitor); |
| // TODO: we can assert that the first value here in fact matches what the index claimed? |
| } else { |
| addAll(state, 2 * nodeID); |
| addAll(state, 2 * nodeID + 1); |
| } |
| } |
| |
| /** Create a new {@link IntersectState} */ |
| public IntersectState getIntersectState(IntersectVisitor visitor) { |
| return new IntersectState(in.clone(), numDims, packedBytesLength, maxPointsInLeafNode, visitor); |
| } |
| |
| /** Visits all docIDs and packed values in a single leaf block */ |
| public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException { |
| int leafID = nodeID - leafNodeOffset; |
| |
| // Leaf node; scan and filter all points in this block: |
| int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs); |
| |
| // Again, this time reading values and checking with the visitor |
| visitDocValues( |
| state.commonPrefixLengths, |
| state.scratchPackedValue, |
| state.in, |
| state.scratchDocIDs, |
| count, |
| state.visitor); |
| } |
| |
| void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException { |
| BytesRefBuilder scratch = new BytesRefBuilder(); |
| in.seek(blockFP); |
| readLine(in, scratch); |
| int count = parseInt(scratch, BLOCK_COUNT); |
| visitor.grow(count); |
| for (int i = 0; i < count; i++) { |
| readLine(in, scratch); |
| visitor.visit(parseInt(scratch, BLOCK_DOC_ID)); |
| } |
| } |
| |
| int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException { |
| BytesRefBuilder scratch = new BytesRefBuilder(); |
| in.seek(blockFP); |
| readLine(in, scratch); |
| int count = parseInt(scratch, BLOCK_COUNT); |
| for (int i = 0; i < count; i++) { |
| readLine(in, scratch); |
| docIDs[i] = parseInt(scratch, BLOCK_DOC_ID); |
| } |
| return count; |
| } |
| |
| void visitDocValues( |
| int[] commonPrefixLengths, |
| byte[] scratchPackedValue, |
| IndexInput in, |
| int[] docIDs, |
| int count, |
| IntersectVisitor visitor) |
| throws IOException { |
| visitor.grow(count); |
| // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths |
| assert scratchPackedValue.length == packedBytesLength; |
| BytesRefBuilder scratch = new BytesRefBuilder(); |
| for (int i = 0; i < count; i++) { |
| readLine(in, scratch); |
| assert startsWith(scratch, BLOCK_VALUE); |
| BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE)); |
| assert br.length == packedBytesLength; |
| System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, packedBytesLength); |
| visitor.visit(docIDs[i], scratchPackedValue); |
| } |
| } |
| |
| private void visitCompressedDocValues( |
| int[] commonPrefixLengths, |
| byte[] scratchPackedValue, |
| IndexInput in, |
| int[] docIDs, |
| int count, |
| IntersectVisitor visitor, |
| int compressedDim) |
| throws IOException { |
| // the byte at `compressedByteOffset` is compressed using run-length compression, |
| // other suffix bytes are stored verbatim |
| final int compressedByteOffset = |
| compressedDim * bytesPerDim + commonPrefixLengths[compressedDim]; |
| commonPrefixLengths[compressedDim]++; |
| int i; |
| for (i = 0; i < count; ) { |
| scratchPackedValue[compressedByteOffset] = in.readByte(); |
| final int runLen = Byte.toUnsignedInt(in.readByte()); |
| for (int j = 0; j < runLen; ++j) { |
| for (int dim = 0; dim < numDims; dim++) { |
| int prefix = commonPrefixLengths[dim]; |
| in.readBytes(scratchPackedValue, dim * bytesPerDim + prefix, bytesPerDim - prefix); |
| } |
| visitor.visit(docIDs[i + j], scratchPackedValue); |
| } |
| i += runLen; |
| } |
| if (i != count) { |
| throw new CorruptIndexException( |
| "Sub blocks do not add up to the expected count: " + count + " != " + i, in); |
| } |
| } |
| |
| private int readCompressedDim(IndexInput in) throws IOException { |
| int compressedDim = in.readByte(); |
| if (compressedDim < -1 || compressedDim >= numIndexDims) { |
| throw new CorruptIndexException("Got compressedDim=" + compressedDim, in); |
| } |
| return compressedDim; |
| } |
| |
| private void readCommonPrefixes( |
| int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException { |
| for (int dim = 0; dim < numDims; dim++) { |
| int prefix = in.readVInt(); |
| commonPrefixLengths[dim] = prefix; |
| if (prefix > 0) { |
| in.readBytes(scratchPackedValue, dim * bytesPerDim, prefix); |
| } |
| // System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix); |
| } |
| } |
| |
| private void intersect( |
| IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) |
| throws IOException { |
| |
| /* |
| System.out.println("\nR: intersect nodeID=" + nodeID); |
| for(int dim=0;dim<numDims;dim++) { |
| System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim)); |
| } |
| */ |
| |
| Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked); |
| |
| if (r == Relation.CELL_OUTSIDE_QUERY) { |
| // This cell is fully outside of the query shape: stop recursing |
| return; |
| } else if (r == Relation.CELL_INSIDE_QUERY) { |
| // This cell is fully inside of the query shape: recursively add all points in this cell |
| // without filtering |
| addAll(state, nodeID); |
| return; |
| } else { |
| // The cell crosses the shape boundary, or the cell fully contains the query, so we fall |
| // through and do full filtering |
| } |
| |
| if (nodeID >= leafNodeOffset) { |
| // TODO: we can assert that the first value here in fact matches what the index claimed? |
| |
| int leafID = nodeID - leafNodeOffset; |
| |
| // In the unbalanced case it's possible the left most node only has one child: |
| if (leafID < leafBlockFPs.length) { |
| // Leaf node; scan and filter all points in this block: |
| int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs); |
| |
| // Again, this time reading values and checking with the visitor |
| visitDocValues( |
| state.commonPrefixLengths, |
| state.scratchPackedValue, |
| state.in, |
| state.scratchDocIDs, |
| count, |
| state.visitor); |
| } |
| |
| } else { |
| |
| // Non-leaf node: recurse on the split left and right nodes |
| |
| int address = nodeID * bytesPerIndexEntry; |
| int splitDim; |
| if (numIndexDims == 1) { |
| splitDim = 0; |
| } else { |
| splitDim = splitPackedValues[address++] & 0xff; |
| } |
| |
| assert splitDim < numIndexDims; |
| |
| // TODO: can we alloc & reuse this up front? |
| |
| byte[] splitPackedValue = new byte[packedIndexBytesLength]; |
| |
| // Recurse on left sub-tree: |
| System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedIndexBytesLength); |
| System.arraycopy( |
| splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim); |
| intersect(state, 2 * nodeID, cellMinPacked, splitPackedValue); |
| |
| // Recurse on right sub-tree: |
| System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedIndexBytesLength); |
| System.arraycopy( |
| splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim); |
| intersect(state, 2 * nodeID + 1, splitPackedValue, cellMaxPacked); |
| } |
| } |
| |
| @Override |
| public long estimatePointCount(IntersectVisitor visitor) { |
| return estimatePointCount(getIntersectState(visitor), 1, minPackedValue, maxPackedValue); |
| } |
| |
| private long estimatePointCount( |
| IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) { |
| Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked); |
| |
| if (r == Relation.CELL_OUTSIDE_QUERY) { |
| // This cell is fully outside of the query shape: stop recursing |
| return 0L; |
| } else if (nodeID >= leafNodeOffset) { |
| // Assume all points match and there are no dups |
| return maxPointsInLeafNode; |
| } else { |
| |
| // Non-leaf node: recurse on the split left and right nodes |
| |
| int address = nodeID * bytesPerIndexEntry; |
| int splitDim; |
| if (numIndexDims == 1) { |
| splitDim = 0; |
| } else { |
| splitDim = splitPackedValues[address++] & 0xff; |
| } |
| |
| assert splitDim < numIndexDims; |
| |
| // TODO: can we alloc & reuse this up front? |
| |
| byte[] splitPackedValue = new byte[packedIndexBytesLength]; |
| |
| // Recurse on left sub-tree: |
| System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedIndexBytesLength); |
| System.arraycopy( |
| splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim); |
| final long leftCost = estimatePointCount(state, 2 * nodeID, cellMinPacked, splitPackedValue); |
| |
| // Recurse on right sub-tree: |
| System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedIndexBytesLength); |
| System.arraycopy( |
| splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim); |
| final long rightCost = |
| estimatePointCount(state, 2 * nodeID + 1, splitPackedValue, cellMaxPacked); |
| return leftCost + rightCost; |
| } |
| } |
| |
| /** Copies the split value for this node into the provided byte array */ |
| public void copySplitValue(int nodeID, byte[] splitPackedValue) { |
| int address = nodeID * bytesPerIndexEntry; |
| int splitDim; |
| if (numIndexDims == 1) { |
| splitDim = 0; |
| } else { |
| splitDim = splitPackedValues[address++] & 0xff; |
| } |
| |
| assert splitDim < numIndexDims; |
| System.arraycopy( |
| splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim); |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return RamUsageEstimator.sizeOf(splitPackedValues) + RamUsageEstimator.sizeOf(leafBlockFPs); |
| } |
| |
| @Override |
| public byte[] getMinPackedValue() { |
| return minPackedValue.clone(); |
| } |
| |
| @Override |
| public byte[] getMaxPackedValue() { |
| return maxPackedValue.clone(); |
| } |
| |
| @Override |
| public int getNumDimensions() { |
| return numDims; |
| } |
| |
| @Override |
| public int getNumIndexDimensions() { |
| return numIndexDims; |
| } |
| |
| @Override |
| public int getBytesPerDimension() { |
| return bytesPerDim; |
| } |
| |
| @Override |
| public long size() { |
| return pointCount; |
| } |
| |
| @Override |
| public int getDocCount() { |
| return docCount; |
| } |
| |
| public boolean isLeafNode(int nodeID) { |
| return nodeID >= leafNodeOffset; |
| } |
| |
| private int parseInt(BytesRefBuilder scratch, BytesRef prefix) { |
| assert startsWith(scratch, prefix); |
| return Integer.parseInt(stripPrefix(scratch, prefix)); |
| } |
| |
| private String stripPrefix(BytesRefBuilder scratch, BytesRef prefix) { |
| return new String( |
| scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8); |
| } |
| |
| private boolean startsWith(BytesRefBuilder scratch, BytesRef prefix) { |
| return StringHelper.startsWith(scratch.get(), prefix); |
| } |
| |
| private void readLine(IndexInput in, BytesRefBuilder scratch) throws IOException { |
| SimpleTextUtil.readLine(in, scratch); |
| } |
| } |