/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.bkd;
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.RamUsageEstimator;
// TODO
// - the compression is somewhat stupid now (delta vInt for 1024 docIDs, no compression for the byte[] values even though they have high locality)
// - allow variable length byte[] (across docs and dims), but this is quite a bit more hairy
// - we could also index "auto-prefix terms" here, and use better compression, and maybe only use for the "fully contained" case so we'd
// only index docIDs
// - the index could be efficiently encoded as an FST, so we don't have wasteful
// (monotonic) long[] leafBlockFPs; or we could use MonotonicLongValues ... but then
// the index is already plenty small: 60M OSM points --> 1.1 MB with 128 points
// per leaf, and you can reduce that by putting more points per leaf
// - we could use threads while building; the higher nodes are very parallelizable
/** Recursively builds a block KD-tree to assign all incoming points in N-dim space to smaller
* and smaller N-dim rectangles (cells) until the number of points in a given
* rectangle is &lt;= <code>maxPointsInLeafNode</code>. The tree is
* fully balanced, which means the leaf nodes will have between 50% and 100% of
* the requested <code>maxPointsInLeafNode</code>. Values that fall exactly
* on a cell boundary may be in either cell.
*
* <p>The number of dimensions can be 1 to <code>MAX_DIMS</code> (currently 15), but every byte[] value is fixed length.
*
* <p>
* See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
*
* <p>This consumes heap during writing: it allocates a <code>LongBitSet(numPoints)</code>,
* and then uses up to the specified {@code maxMBSortInHeap} heap space for writing.
*
* <p>
* <b>NOTE</b>: This can write at most Integer.MAX_VALUE * <code>maxPointsInLeafNode</code> total points.
*
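* <p>A minimal usage sketch, assuming two dimensions of 4 bytes each; <code>tempDir</code>,
* <code>out</code>, <code>docID</code> and the encoding step are illustrative placeholders,
* not part of this class:
*
* <pre>
*   try (BKDWriter writer = new BKDWriter(tempDir, "bkd", 2, 4)) {
*     byte[] packed = new byte[2 * 4];
*     // ... encode one value per dimension into packed ...
*     writer.add(packed, docID);
*     long indexFP = writer.finish(out);  // file pointer where the index starts
*   }
* </pre>
*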
* @lucene.experimental */
public final class BKDWriter implements Closeable {
static final String CODEC_NAME = "BKD";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
/** How many bytes each doc takes in the fixed-width offline format */
private final int bytesPerDoc;
public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024;
public static final float DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0f;
/** Maximum number of dimensions */
public static final int MAX_DIMS = 15;
/** How many dimensions we are indexing */
final int numDims;
/** How many bytes each value in each dimension takes. */
final int bytesPerDim;
/** numDims * bytesPerDim */
final int packedBytesLength;
final TrackingDirectoryWrapper tempDir;
final String tempFileNamePrefix;
final byte[] scratchDiff;
final byte[] scratchPackedValue;
final byte[] scratch1;
final byte[] scratch2;
private OfflinePointWriter offlinePointWriter;
private HeapPointWriter heapPointWriter;
private IndexOutput tempInput;
private final int maxPointsInLeafNode;
private final int maxPointsSortInHeap;
private long pointCount;
public BKDWriter(Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim) throws IOException {
this(tempDir, tempFileNamePrefix, numDims, bytesPerDim, DEFAULT_MAX_POINTS_IN_LEAF_NODE, DEFAULT_MAX_MB_SORT_IN_HEAP);
}
public BKDWriter(Directory tempDir, String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap) throws IOException {
verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap);
// We use tracking dir to deal with removing files on exception, so each place that
// creates temp files doesn't need crazy try/finally/success logic:
this.tempDir = new TrackingDirectoryWrapper(tempDir);
this.tempFileNamePrefix = tempFileNamePrefix;
this.maxPointsInLeafNode = maxPointsInLeafNode;
this.numDims = numDims;
this.bytesPerDim = bytesPerDim;
packedBytesLength = numDims * bytesPerDim;
scratchDiff = new byte[bytesPerDim];
scratchPackedValue = new byte[packedBytesLength];
scratch1 = new byte[packedBytesLength];
scratch2 = new byte[packedBytesLength];
// dimensional values (numDims * bytesPerDim) + ord (long) + docID (int)
bytesPerDoc = packedBytesLength + RamUsageEstimator.NUM_BYTES_LONG + RamUsageEstimator.NUM_BYTES_INT;
// As we recurse, we compute temporary partitions of the data, halving the
// number of points at each recursion. Once there are few enough points,
// we can switch to sorting in heap instead of offline (on disk). At any
// time in the recursion, we hold the number of points at that level, plus
// all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
// what that level would consume, so we multiply by 0.5 to convert from
// bytes to points here. Each dimension has its own sorted partition, so
// we must divide by numDims as well.
maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
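// For example, numDims=2 and bytesPerDim=4 give bytesPerDoc = 8 + 8 + 4 = 20 bytes, so the
// default maxMBSortInHeap=16.0 yields maxPointsSortInHeap = 0.5 * 16*1024*1024 / (20*2)
// = ~209,715 points.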
// Finally, we must be able to hold at least the leaf node in heap during build:
if (maxPointsSortInHeap < maxPointsInLeafNode) {
throw new IllegalArgumentException("maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap + ", but this is less than maxPointsInLeafNode=" + maxPointsInLeafNode + "; either increase maxMBSortInHeap or decrease maxPointsInLeafNode");
}
// We write the first maxPointsSortInHeap points in heap, then cut over to offline for additional points:
heapPointWriter = new HeapPointWriter(16, maxPointsSortInHeap, packedBytesLength);
}
public static void verifyParams(int numDims, int maxPointsInLeafNode, double maxMBSortInHeap) {
// We encode the dim in a single byte in the splitPackedValues, but we only expose 4 bits for it now, in case we want to use
// the remaining 4 bits for another purpose later
if (numDims < 1 || numDims > MAX_DIMS) {
throw new IllegalArgumentException("numDims must be 1 .. " + MAX_DIMS + " (got: " + numDims + ")");
}
if (maxPointsInLeafNode <= 0) {
throw new IllegalArgumentException("maxPointsInLeafNode must be > 0; got " + maxPointsInLeafNode);
}
if (maxPointsInLeafNode > ArrayUtil.MAX_ARRAY_LENGTH) {
throw new IllegalArgumentException("maxPointsInLeafNode must be <= ArrayUtil.MAX_ARRAY_LENGTH (= " + ArrayUtil.MAX_ARRAY_LENGTH + "); got " + maxPointsInLeafNode);
}
if (maxMBSortInHeap < 0.0) {
throw new IllegalArgumentException("maxMBSortInHeap must be >= 0.0 (got: " + maxMBSortInHeap + ")");
}
}
/** If the current segment has too many points then we switch over to temp files / offline sort. */
private void switchToOffline() throws IOException {
// For each .add we just append to this input file, then in .finish we sort this input and recursively build the tree:
offlinePointWriter = new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength);
tempInput = offlinePointWriter.out;
PointReader reader = heapPointWriter.getReader(0);
for(int i=0;i<pointCount;i++) {
boolean hasNext = reader.next();
assert hasNext;
offlinePointWriter.append(reader.packedValue(), i, heapPointWriter.docIDs[i]);
}
heapPointWriter = null;
}
public void add(byte[] packedValue, int docID) throws IOException {
if (packedValue.length != packedBytesLength) {
throw new IllegalArgumentException("packedValue should be length=" + packedBytesLength + " (got: " + packedValue.length + ")");
}
if (pointCount >= maxPointsSortInHeap) {
if (offlinePointWriter == null) {
switchToOffline();
}
offlinePointWriter.append(packedValue, pointCount, docID);
} else {
// Not too many points added yet, continue using heap:
heapPointWriter.append(packedValue, pointCount, docID);
}
pointCount++;
}
// TODO: if we fixed each partition step to just record the file offset at the "split point", we could probably handle variable length
// encoding and not have our own ByteSequencesReader/Writer
/** If dim=-1 we sort by docID, else by that dim. */
private void sortHeapPointWriter(final HeapPointWriter writer, int start, int length, int dim) {
assert pointCount < Integer.MAX_VALUE;
// All buffered points are still in heap; just do in-place sort:
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int docID = writer.docIDs[i];
writer.docIDs[i] = writer.docIDs[j];
writer.docIDs[j] = docID;
long ord = writer.ords[i];
writer.ords[i] = writer.ords[j];
writer.ords[j] = ord;
// scratch1 = values[i]
writer.readPackedValue(i, scratch1);
// scratch2 = values[j]
writer.readPackedValue(j, scratch2);
// values[i] = scratch2
writer.writePackedValue(i, scratch2);
// values[j] = scratch1
writer.writePackedValue(j, scratch1);
}
@Override
protected int compare(int i, int j) {
if (dim != -1) {
writer.readPackedValue(i, scratch1);
writer.readPackedValue(j, scratch2);
int cmp = BKDUtil.compare(bytesPerDim, scratch1, dim, scratch2, dim);
if (cmp != 0) {
return cmp;
}
}
// Tie-break
int cmp = Integer.compare(writer.docIDs[i], writer.docIDs[j]);
if (cmp != 0) {
return cmp;
}
return Long.compare(writer.ords[i], writer.ords[j]);
}
}.sort(start, start+length);
}
private PointWriter sort(int dim) throws IOException {
if (heapPointWriter != null) {
assert tempInput == null;
// We never spilled the incoming points to disk, so now we sort in heap:
HeapPointWriter sorted;
if (dim == 0) {
// First dim can re-use the current heap writer
sorted = heapPointWriter;
} else {
// Subsequent dims need a private copy
sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength);
sorted.copyFrom(heapPointWriter);
}
sortHeapPointWriter(sorted, 0, (int) pointCount, dim);
sorted.close();
return sorted;
} else {
// Offline sort:
assert tempInput != null;
final ByteArrayDataInput reader = new ByteArrayDataInput();
Comparator<BytesRef> cmp = new Comparator<BytesRef>() {
private final ByteArrayDataInput readerB = new ByteArrayDataInput();
@Override
public int compare(BytesRef a, BytesRef b) {
reader.reset(a.bytes, a.offset, a.length);
reader.readBytes(scratch1, 0, scratch1.length);
final int docIDA = reader.readVInt();
final long ordA = reader.readVLong();
reader.reset(b.bytes, b.offset, b.length);
reader.readBytes(scratch2, 0, scratch2.length);
final int docIDB = reader.readVInt();
final long ordB = reader.readVLong();
int cmp = BKDUtil.compare(bytesPerDim, scratch1, dim, scratch2, dim);
if (cmp != 0) {
return cmp;
}
// Tie-break
cmp = Integer.compare(docIDA, docIDB);
if (cmp != 0) {
return cmp;
}
return Long.compare(ordA, ordB);
}
};
// TODO: this is a somewhat sneaky way to get the final OfflinePointWriter from OfflineSorter:
IndexOutput[] lastWriter = new IndexOutput[1];
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, cmp) {
/** We write/read a fixed-byte-width file that {@link OfflinePointReader} can read. */
@Override
protected ByteSequencesWriter getWriter(IndexOutput out) {
lastWriter[0] = out;
return new ByteSequencesWriter(out) {
@Override
public void write(byte[] bytes, int off, int len) throws IOException {
if (len != bytesPerDoc) {
throw new IllegalArgumentException("len=" + len + " bytesPerDoc=" + bytesPerDoc);
}
out.writeBytes(bytes, off, len);
}
};
}
/** We write/read a fixed-byte-width file that {@link OfflinePointReader} can read. */
@Override
protected ByteSequencesReader getReader(IndexInput in) throws IOException {
return new ByteSequencesReader(in) {
@Override
public boolean read(BytesRefBuilder ref) throws IOException {
ref.grow(bytesPerDoc);
try {
in.readBytes(ref.bytes(), 0, bytesPerDoc);
} catch (EOFException eofe) {
return false;
}
ref.setLength(bytesPerDoc);
return true;
}
};
}
};
sorter.sort(tempInput.getName());
assert lastWriter[0] != null;
return new OfflinePointWriter(tempDir, lastWriter[0], packedBytesLength, pointCount);
}
}
/** Writes the BKD tree to the provided {@link IndexOutput} and returns the file offset where index was written. */
public long finish(IndexOutput out) throws IOException {
//System.out.println("\nBKDTreeWriter.finish pointCount=" + pointCount + " out=" + out + " heapWriter=" + heapWriter);
// TODO: specialize the 1D case? it's much faster at indexing time (no partitioning on recurse...)
if (offlinePointWriter != null) {
offlinePointWriter.close();
}
LongBitSet ordBitSet = new LongBitSet(pointCount);
long countPerLeaf = pointCount;
long innerNodeCount = 1;
while (countPerLeaf > maxPointsInLeafNode) {
countPerLeaf = (countPerLeaf+1)/2;
innerNodeCount *= 2;
}
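// For example, pointCount=1000 with maxPointsInLeafNode=128 halves countPerLeaf
// 1000 -> 500 -> 250 -> 125 while innerNodeCount doubles 1 -> 2 -> 4 -> 8, giving
// a tree with 8 leaves holding at most 125 points each.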
//System.out.println("innerNodeCount=" + innerNodeCount);
if (1+2*innerNodeCount >= Integer.MAX_VALUE) {
throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex");
}
innerNodeCount--;
int numLeaves = (int) (innerNodeCount+1);
// NOTE: we could save the 1+ here, to use a bit less heap at search time, but then we'd need a somewhat costly check at each
// step of the recursion to recompute the split dim:
// Indexed by nodeID, but first (root) nodeID is 1. We do 1+ because the lead byte at each recursion says which dim we split on.
byte[] splitPackedValues = new byte[Math.toIntExact(numLeaves*(1+bytesPerDim))];
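// For example, with bytesPerDim=4 each per-node entry is 5 bytes, so nodeID=3's split
// dim byte lives at splitPackedValues[15], followed by its 4-byte split value.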
// +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. 7)
long[] leafBlockFPs = new long[numLeaves];
// Make sure the math above "worked":
assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode;
// Sort all docs once by each dimension:
PathSlice[] sortedPointWriters = new PathSlice[numDims];
byte[] minPacked = new byte[packedBytesLength];
byte[] maxPacked = new byte[packedBytesLength];
Arrays.fill(maxPacked, (byte) 0xff);
boolean success = false;
try {
for(int dim=0;dim<numDims;dim++) {
sortedPointWriters[dim] = new PathSlice(sort(dim), 0, pointCount);
}
if (tempInput != null) {
tempDir.deleteFile(tempInput.getName());
tempInput = null;
} else {
assert heapPointWriter != null;
heapPointWriter = null;
}
build(1, numLeaves, sortedPointWriters,
ordBitSet, out,
minPacked, maxPacked,
splitPackedValues,
leafBlockFPs);
for(PathSlice slice : sortedPointWriters) {
slice.writer.destroy();
}
// If no exception, we should have cleaned everything up:
assert tempDir.getCreatedFiles().isEmpty();
success = true;
} finally {
if (success == false) {
IOUtils.deleteFilesIgnoringExceptions(tempDir, tempDir.getCreatedFiles());
}
}
//System.out.println("Total nodes: " + innerNodeCount);
// Write index:
long indexFP = out.getFilePointer();
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
out.writeVInt(numDims);
out.writeVInt(maxPointsInLeafNode);
out.writeVInt(bytesPerDim);
out.writeVInt(numLeaves);
// NOTE: splitPackedValues[0] is unused, because nodeID is 1-based:
out.writeBytes(splitPackedValues, 0, splitPackedValues.length);
for (int i=0;i<leafBlockFPs.length;i++) {
out.writeVLong(leafBlockFPs[i]);
}
return indexFP;
}
@Override
public void close() throws IOException {
if (tempInput != null) {
// NOTE: this should only happen on exception, e.g. caller calls close w/o calling finish:
try {
tempInput.close();
} finally {
tempDir.deleteFile(tempInput.getName());
tempInput = null;
}
}
}
/** Sliced reference to points in an OfflineSorter.ByteSequencesWriter file. */
private static final class PathSlice {
final PointWriter writer;
final long start;
final long count;
public PathSlice(PointWriter writer, long start, long count) {
this.writer = writer;
this.start = start;
this.count = count;
}
@Override
public String toString() {
return "PathSlice(start=" + start + " count=" + count + " writer=" + writer + ")";
}
}
/** Marks bits for the ords (points) that belong in the right subtree (those docs that have values >= the splitValue). */
private byte[] markRightTree(long rightCount, int splitDim, PathSlice source, LongBitSet ordBitSet) throws IOException {
// Now we mark ords that fall into the right half, so we can partition on all other dims that are not the split dim:
assert ordBitSet.cardinality() == 0: "cardinality=" + ordBitSet.cardinality();
// Read the split value, then mark all ords in the right tree (larger than the split value):
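// (e.g. for count=100 and rightCount=50, we seek to source.start+50 and mark the ords of points 50..99):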
try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount)) {
boolean result = reader.next();
assert result;
System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
ordBitSet.set(reader.ord());
// Start at 1 because we already did the first value above (so we could keep the split value):
for(int i=1;i<rightCount;i++) {
result = reader.next();
assert result;
ordBitSet.set(reader.ord());
}
assert rightCount == ordBitSet.cardinality(): "rightCount=" + rightCount + " cardinality=" + ordBitSet.cardinality();
}
return scratch1;
}
/** Called only in assert */
private boolean valueInBounds(byte[] packedValue, byte[] minPackedValue, byte[] maxPackedValue) {
for(int dim=0;dim<numDims;dim++) {
if (BKDUtil.compare(bytesPerDim, packedValue, dim, minPackedValue, dim) < 0) {
return false;
}
if (BKDUtil.compare(bytesPerDim, packedValue, dim, maxPackedValue, dim) > 0) {
return false;
}
}
return true;
}
// TODO: make this protected when we want to subclass to play with different splitting criteria
private int split(byte[] minPackedValue, byte[] maxPackedValue) {
// Find which dim has the largest span so we can split on it:
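// (e.g. if dim 0 spans 0x03..0x0a (diff 7) and dim 1 spans 0x02..0x04 (diff 2), we pick dim 0):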
int splitDim = -1;
for(int dim=0;dim<numDims;dim++) {
BKDUtil.subtract(bytesPerDim, dim, maxPackedValue, minPackedValue, scratchDiff);
if (splitDim == -1 || BKDUtil.compare(bytesPerDim, scratchDiff, 0, scratch1, 0) > 0) {
System.arraycopy(scratchDiff, 0, scratch1, 0, bytesPerDim);
splitDim = dim;
}
}
return splitDim;
}
/** Only called in the 1D case, to pull a partition back into heap once
* the point count is low enough while recursing. */
private PathSlice switchToHeap(PathSlice source) throws IOException {
int count = Math.toIntExact(source.count);
try (
PointWriter writer = new HeapPointWriter(count, count, packedBytesLength);
PointReader reader = source.writer.getReader(source.start);
) {
for(int i=0;i<count;i++) {
boolean hasNext = reader.next();
assert hasNext;
writer.append(reader.packedValue(), reader.ord(), reader.docID());
}
return new PathSlice(writer, 0, count);
}
}
/** The array (sized numDims) of PathSlice describes the cell we have currently recursed to. */
private void build(int nodeID, int leafNodeOffset,
PathSlice[] slices,
LongBitSet ordBitSet,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues,
long[] leafBlockFPs) throws IOException {
for(PathSlice slice : slices) {
assert slice.count == slices[0].count;
}
if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
// Special case for 1D, to cutover to heap once we recurse deeply enough:
slices[0] = switchToHeap(slices[0]);
}
if (nodeID >= leafNodeOffset) {
// Leaf node: write block
PathSlice source = slices[0];
if (source.writer instanceof HeapPointWriter == false) {
// Adversarial cases can cause this, e.g. very lopsided data, all equal points
source = switchToHeap(source);
}
// We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
HeapPointWriter heapSource = (HeapPointWriter) source.writer;
// Sort by docID in the leaf so we can delta-vInt encode:
sortHeapPointWriter(heapSource, Math.toIntExact(source.start), Math.toIntExact(source.count), -1);
int lastDocID = 0;
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
out.writeVInt(Math.toIntExact(source.count));
// Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
// loading the values:
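// (delta coding: e.g. sorted docIDs 5, 9, 12 are written as the vInts 5, 4, 3)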
for (int i=0;i<source.count;i++) {
int docID = heapSource.docIDs[Math.toIntExact(source.start + i)];
out.writeVInt(docID - lastDocID);
lastDocID = docID;
}
// TODO: we should delta compress / only write suffix bytes, like terms dict (the values will all be "close together" since we are at
// a leaf cell):
// Now write the full values:
for (int i=0;i<source.count;i++) {
// TODO: we could do bulk copying here, avoiding the intermediate copy:
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
// Make sure this value does in fact fall within this leaf cell:
assert valueInBounds(scratchPackedValue, minPackedValue, maxPackedValue);
out.writeBytes(scratchPackedValue, 0, scratchPackedValue.length);
}
} else {
// Inner node: partition/recurse
int splitDim = split(minPackedValue, maxPackedValue);
PathSlice source = slices[splitDim];
assert nodeID < splitPackedValues.length: "nodeID=" + nodeID + " splitPackedValues.length=" + splitPackedValues.length;
// How many points will go into the left and right trees:
long rightCount = source.count / 2;
long leftCount = source.count - rightCount;
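// (e.g. count=101 gives rightCount=50, leftCount=51; the left tree gets the extra point)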
byte[] splitValue = markRightTree(rightCount, splitDim, source, ordBitSet);
int address = nodeID * (1+bytesPerDim);
splitPackedValues[address] = (byte) splitDim;
System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim);
// Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse:
PathSlice[] leftSlices = new PathSlice[numDims];
PathSlice[] rightSlices = new PathSlice[numDims];
byte[] minSplitPackedValue = new byte[packedBytesLength];
System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedBytesLength);
byte[] maxSplitPackedValue = new byte[packedBytesLength];
System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, packedBytesLength);
for(int dim=0;dim<numDims;dim++) {
if (dim == splitDim) {
// No need to partition on this dim since it's a simple slice of the incoming already sorted slice.
leftSlices[dim] = new PathSlice(source.writer, source.start, leftCount);
rightSlices[dim] = new PathSlice(source.writer, source.start + leftCount, rightCount);
System.arraycopy(splitValue, 0, minSplitPackedValue, dim*bytesPerDim, bytesPerDim);
System.arraycopy(splitValue, 0, maxSplitPackedValue, dim*bytesPerDim, bytesPerDim);
continue;
}
try (PointWriter leftPointWriter = getPointWriter(leftCount);
PointWriter rightPointWriter = getPointWriter(source.count - leftCount);
PointReader reader = slices[dim].writer.getReader(slices[dim].start);) {
// Partition this source according to how the splitDim split the values:
int nextRightCount = 0;
for (int i=0;i<source.count;i++) {
boolean result = reader.next();
assert result;
byte[] packedValue = reader.packedValue();
long ord = reader.ord();
int docID = reader.docID();
if (ordBitSet.get(ord)) {
rightPointWriter.append(packedValue, ord, docID);
nextRightCount++;
} else {
leftPointWriter.append(packedValue, ord, docID);
}
}
leftSlices[dim] = new PathSlice(leftPointWriter, 0, leftCount);
rightSlices[dim] = new PathSlice(rightPointWriter, 0, rightCount);
assert rightCount == nextRightCount: "rightCount=" + rightCount + " nextRightCount=" + nextRightCount;
}
}
ordBitSet.clear(0, pointCount);
// Recurse on left tree:
build(2*nodeID, leafNodeOffset, leftSlices,
ordBitSet, out,
minPackedValue, maxSplitPackedValue,
splitPackedValues, leafBlockFPs);
for(int dim=0;dim<numDims;dim++) {
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
if (dim != splitDim) {
leftSlices[dim].writer.destroy();
}
}
// TODO: we could "tail recurse" here? have our parent discard its refs as we recurse right?
// Recurse on right tree:
build(2*nodeID+1, leafNodeOffset, rightSlices,
ordBitSet, out,
minSplitPackedValue, maxPackedValue,
splitPackedValues, leafBlockFPs);
for(int dim=0;dim<numDims;dim++) {
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
if (dim != splitDim) {
rightSlices[dim].writer.destroy();
}
}
}
}
PointWriter getPointWriter(long count) throws IOException {
if (count <= maxPointsSortInHeap) {
int size = Math.toIntExact(count);
return new HeapPointWriter(size, size, packedBytesLength);
} else {
return new OfflinePointWriter(tempDir, tempFileNamePrefix, packedBytesLength);
}
}
}