blob: c630e43659fa5f2e8633a3d51c9de61fe63a0342 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_COUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.bkd.BKDReader;
/** Forked from {@link BKDReader} and simplified/specialized for SimpleText's usage */
final class SimpleTextBKDReader extends PointValues implements Accountable {
// Packed array of byte[] holding all split values in the full binary tree:
private final byte[] splitPackedValues;
final long[] leafBlockFPs;
private final int leafNodeOffset;
final int numDims;
final int numIndexDims;
final int bytesPerDim;
final int bytesPerIndexEntry;
final IndexInput in;
final int maxPointsInLeafNode;
final byte[] minPackedValue;
final byte[] maxPackedValue;
final long pointCount;
final int docCount;
final int version;
protected final int packedBytesLength;
protected final int packedIndexBytesLength;
public SimpleTextBKDReader(
IndexInput in,
int numDims,
int numIndexDims,
int maxPointsInLeafNode,
int bytesPerDim,
long[] leafBlockFPs,
byte[] splitPackedValues,
byte[] minPackedValue,
byte[] maxPackedValue,
long pointCount,
int docCount)
throws IOException {
this.in = in;
this.numDims = numDims;
this.numIndexDims = numIndexDims;
this.maxPointsInLeafNode = maxPointsInLeafNode;
this.bytesPerDim = bytesPerDim;
// no version check here because callers of this API (SimpleText) have no back compat:
bytesPerIndexEntry = numIndexDims == 1 ? bytesPerDim : bytesPerDim + 1;
packedBytesLength = numDims * bytesPerDim;
packedIndexBytesLength = numIndexDims * bytesPerDim;
this.leafNodeOffset = leafBlockFPs.length;
this.leafBlockFPs = leafBlockFPs;
this.splitPackedValues = splitPackedValues;
this.minPackedValue = minPackedValue;
this.maxPackedValue = maxPackedValue;
this.pointCount = pointCount;
this.docCount = docCount;
this.version = SimpleTextBKDWriter.VERSION_CURRENT;
assert minPackedValue.length == packedIndexBytesLength;
assert maxPackedValue.length == packedIndexBytesLength;
}
/** Used to track all state for a single call to {@link #intersect}. */
public static final class IntersectState {
final IndexInput in;
final int[] scratchDocIDs;
final byte[] scratchPackedValue;
final int[] commonPrefixLengths;
final IntersectVisitor visitor;
public IntersectState(
IndexInput in,
int numDims,
int packedBytesLength,
int maxPointsInLeafNode,
IntersectVisitor visitor) {
this.in = in;
this.visitor = visitor;
this.commonPrefixLengths = new int[numDims];
this.scratchDocIDs = new int[maxPointsInLeafNode];
this.scratchPackedValue = new byte[packedBytesLength];
}
}
public void intersect(IntersectVisitor visitor) throws IOException {
intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
}
/** Fast path: this is called when the query box fully encompasses all cells under this node. */
private void addAll(IntersectState state, int nodeID) throws IOException {
// System.out.println("R: addAll nodeID=" + nodeID);
if (nodeID >= leafNodeOffset) {
// System.out.println("ADDALL");
visitDocIDs(state.in, leafBlockFPs[nodeID - leafNodeOffset], state.visitor);
// TODO: we can assert that the first value here in fact matches what the index claimed?
} else {
addAll(state, 2 * nodeID);
addAll(state, 2 * nodeID + 1);
}
}
/** Create a new {@link IntersectState} */
public IntersectState getIntersectState(IntersectVisitor visitor) {
return new IntersectState(in.clone(), numDims, packedBytesLength, maxPointsInLeafNode, visitor);
}
/** Visits all docIDs and packed values in a single leaf block */
public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException {
int leafID = nodeID - leafNodeOffset;
// Leaf node; scan and filter all points in this block:
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(
state.commonPrefixLengths,
state.scratchPackedValue,
state.in,
state.scratchDocIDs,
count,
state.visitor);
}
void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
in.seek(blockFP);
readLine(in, scratch);
int count = parseInt(scratch, BLOCK_COUNT);
visitor.grow(count);
for (int i = 0; i < count; i++) {
readLine(in, scratch);
visitor.visit(parseInt(scratch, BLOCK_DOC_ID));
}
}
int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
in.seek(blockFP);
readLine(in, scratch);
int count = parseInt(scratch, BLOCK_COUNT);
for (int i = 0; i < count; i++) {
readLine(in, scratch);
docIDs[i] = parseInt(scratch, BLOCK_DOC_ID);
}
return count;
}
void visitDocValues(
int[] commonPrefixLengths,
byte[] scratchPackedValue,
IndexInput in,
int[] docIDs,
int count,
IntersectVisitor visitor)
throws IOException {
visitor.grow(count);
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
assert scratchPackedValue.length == packedBytesLength;
BytesRefBuilder scratch = new BytesRefBuilder();
for (int i = 0; i < count; i++) {
readLine(in, scratch);
assert startsWith(scratch, BLOCK_VALUE);
BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE));
assert br.length == packedBytesLength;
System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, packedBytesLength);
visitor.visit(docIDs[i], scratchPackedValue);
}
}
private void visitCompressedDocValues(
int[] commonPrefixLengths,
byte[] scratchPackedValue,
IndexInput in,
int[] docIDs,
int count,
IntersectVisitor visitor,
int compressedDim)
throws IOException {
// the byte at `compressedByteOffset` is compressed using run-length compression,
// other suffix bytes are stored verbatim
final int compressedByteOffset =
compressedDim * bytesPerDim + commonPrefixLengths[compressedDim];
commonPrefixLengths[compressedDim]++;
int i;
for (i = 0; i < count; ) {
scratchPackedValue[compressedByteOffset] = in.readByte();
final int runLen = Byte.toUnsignedInt(in.readByte());
for (int j = 0; j < runLen; ++j) {
for (int dim = 0; dim < numDims; dim++) {
int prefix = commonPrefixLengths[dim];
in.readBytes(scratchPackedValue, dim * bytesPerDim + prefix, bytesPerDim - prefix);
}
visitor.visit(docIDs[i + j], scratchPackedValue);
}
i += runLen;
}
if (i != count) {
throw new CorruptIndexException(
"Sub blocks do not add up to the expected count: " + count + " != " + i, in);
}
}
private int readCompressedDim(IndexInput in) throws IOException {
int compressedDim = in.readByte();
if (compressedDim < -1 || compressedDim >= numIndexDims) {
throw new CorruptIndexException("Got compressedDim=" + compressedDim, in);
}
return compressedDim;
}
private void readCommonPrefixes(
int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException {
for (int dim = 0; dim < numDims; dim++) {
int prefix = in.readVInt();
commonPrefixLengths[dim] = prefix;
if (prefix > 0) {
in.readBytes(scratchPackedValue, dim * bytesPerDim, prefix);
}
// System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
}
}
private void intersect(
IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked)
throws IOException {
/*
System.out.println("\nR: intersect nodeID=" + nodeID);
for(int dim=0;dim<numDims;dim++) {
System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
}
*/
Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);
if (r == Relation.CELL_OUTSIDE_QUERY) {
// This cell is fully outside of the query shape: stop recursing
return;
} else if (r == Relation.CELL_INSIDE_QUERY) {
// This cell is fully inside of the query shape: recursively add all points in this cell
// without filtering
addAll(state, nodeID);
return;
} else {
// The cell crosses the shape boundary, or the cell fully contains the query, so we fall
// through and do full filtering
}
if (nodeID >= leafNodeOffset) {
// TODO: we can assert that the first value here in fact matches what the index claimed?
int leafID = nodeID - leafNodeOffset;
// In the unbalanced case it's possible the left most node only has one child:
if (leafID < leafBlockFPs.length) {
// Leaf node; scan and filter all points in this block:
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(
state.commonPrefixLengths,
state.scratchPackedValue,
state.in,
state.scratchDocIDs,
count,
state.visitor);
}
} else {
// Non-leaf node: recurse on the split left and right nodes
int address = nodeID * bytesPerIndexEntry;
int splitDim;
if (numIndexDims == 1) {
splitDim = 0;
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
assert splitDim < numIndexDims;
// TODO: can we alloc & reuse this up front?
byte[] splitPackedValue = new byte[packedIndexBytesLength];
// Recurse on left sub-tree:
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedIndexBytesLength);
System.arraycopy(
splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
intersect(state, 2 * nodeID, cellMinPacked, splitPackedValue);
// Recurse on right sub-tree:
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedIndexBytesLength);
System.arraycopy(
splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
intersect(state, 2 * nodeID + 1, splitPackedValue, cellMaxPacked);
}
}
@Override
public long estimatePointCount(IntersectVisitor visitor) {
return estimatePointCount(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
}
private long estimatePointCount(
IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) {
Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);
if (r == Relation.CELL_OUTSIDE_QUERY) {
// This cell is fully outside of the query shape: stop recursing
return 0L;
} else if (nodeID >= leafNodeOffset) {
// Assume all points match and there are no dups
return maxPointsInLeafNode;
} else {
// Non-leaf node: recurse on the split left and right nodes
int address = nodeID * bytesPerIndexEntry;
int splitDim;
if (numIndexDims == 1) {
splitDim = 0;
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
assert splitDim < numIndexDims;
// TODO: can we alloc & reuse this up front?
byte[] splitPackedValue = new byte[packedIndexBytesLength];
// Recurse on left sub-tree:
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedIndexBytesLength);
System.arraycopy(
splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
final long leftCost = estimatePointCount(state, 2 * nodeID, cellMinPacked, splitPackedValue);
// Recurse on right sub-tree:
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedIndexBytesLength);
System.arraycopy(
splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
final long rightCost =
estimatePointCount(state, 2 * nodeID + 1, splitPackedValue, cellMaxPacked);
return leftCost + rightCost;
}
}
/** Copies the split value for this node into the provided byte array */
public void copySplitValue(int nodeID, byte[] splitPackedValue) {
int address = nodeID * bytesPerIndexEntry;
int splitDim;
if (numIndexDims == 1) {
splitDim = 0;
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
assert splitDim < numIndexDims;
System.arraycopy(
splitPackedValues, address, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
}
@Override
public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(splitPackedValues) + RamUsageEstimator.sizeOf(leafBlockFPs);
}
@Override
public byte[] getMinPackedValue() {
return minPackedValue.clone();
}
@Override
public byte[] getMaxPackedValue() {
return maxPackedValue.clone();
}
@Override
public int getNumDimensions() {
return numDims;
}
@Override
public int getNumIndexDimensions() {
return numIndexDims;
}
@Override
public int getBytesPerDimension() {
return bytesPerDim;
}
@Override
public long size() {
return pointCount;
}
@Override
public int getDocCount() {
return docCount;
}
public boolean isLeafNode(int nodeID) {
return nodeID >= leafNodeOffset;
}
private int parseInt(BytesRefBuilder scratch, BytesRef prefix) {
assert startsWith(scratch, prefix);
return Integer.parseInt(stripPrefix(scratch, prefix));
}
private String stripPrefix(BytesRefBuilder scratch, BytesRef prefix) {
return new String(
scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8);
}
private boolean startsWith(BytesRefBuilder scratch, BytesRef prefix) {
return StringHelper.startsWith(scratch.get(), prefix);
}
private void readLine(IndexInput in, BytesRefBuilder scratch) throws IOException {
SimpleTextUtil.readLine(in, scratch);
}
}