blob: eed204baf270d856a613bc714c1697c0ba353f89 [file] [log] [blame]
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index bf360d3..5e391f4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -479,9 +479,12 @@ public class BKDWriter implements Closeable {
docsSeen.set(values.getDocID(i));
}
+ final int[] parentSplits = new int[numDims];
build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out,
- minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
+ minPackedValue, maxPackedValue, parentSplits,
+ splitPackedValues, leafBlockFPs,
new int[maxPointsInLeafNode]);
+ assert Arrays.equals(parentSplits, new int[numDims]);
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues);
@@ -1001,12 +1004,15 @@ public class BKDWriter implements Closeable {
heapPointWriter = null;
}
+ final int[] parentSplits = new int[numDims];
build(1, numLeaves, sortedPointWriters,
ordBitSet, out,
minPackedValue, maxPackedValue,
+ parentSplits,
splitPackedValues,
leafBlockFPs,
toCloseHeroically);
+ assert Arrays.equals(parentSplits, new int[numDims]);
for(PathSlice slice : sortedPointWriters) {
slice.writer.destroy();
@@ -1413,7 +1419,29 @@ public class BKDWriter implements Closeable {
return true;
}
- protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
+ /**
+ * Pick the next dimension to split.
+ * @param minPackedValue the min values for all dimensions
+ * @param maxPackedValue the max values for all dimensions
+ * @param parentSplits how many times each dim has been split on the parent levels
+ * @return the dimension to split
+ */
+ protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits) {
+ // First look at whether there is a dimension that has split less than 2x less than
+ // the dim that has most splits, and return it if there is such a dimension and it
+ // does not only have equals values. This helps ensure all dimensions are indexed.
+ int maxNumSplits = 0;
+ for (int numSplits : parentSplits) {
+ maxNumSplits = Math.max(maxNumSplits, numSplits);
+ }
+ for (int dim = 0; dim < numDims; ++dim) {
+ final int offset = dim * bytesPerDim;
+ if (parentSplits[dim] < maxNumSplits / 2 &&
+ StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) != 0) {
+ return dim;
+ }
+ }
+
// Find which dim has the largest span so we can split on it:
int splitDim = -1;
for(int dim=0;dim<numDims;dim++) {
@@ -1454,6 +1482,7 @@ public class BKDWriter implements Closeable {
MutablePointValues reader, int from, int to,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
+ int[] parentSplits,
byte[] splitPackedValues,
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
@@ -1547,7 +1576,7 @@ public class BKDWriter implements Closeable {
// inner node
// compute the split dimension and partition around it
- final int splitDim = split(minPackedValue, maxPackedValue);
+ final int splitDim = split(minPackedValue, maxPackedValue, parentSplits);
final int mid = (from + to + 1) >>> 1;
int commonPrefixLen = bytesPerDim;
@@ -1575,10 +1604,14 @@ public class BKDWriter implements Closeable {
maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim);
// recurse
+ parentSplits[splitDim]++;
build(nodeID * 2, leafNodeOffset, reader, from, mid, out,
- minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
+ minPackedValue, maxSplitPackedValue, parentSplits,
+ splitPackedValues, leafBlockFPs, spareDocIds);
build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out,
- minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds);
+ minSplitPackedValue, maxPackedValue, parentSplits,
+ splitPackedValues, leafBlockFPs, spareDocIds);
+ parentSplits[splitDim]--;
}
}
@@ -1589,6 +1622,7 @@ public class BKDWriter implements Closeable {
LongBitSet ordBitSet,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
+ int[] parentSplits,
byte[] splitPackedValues,
long[] leafBlockFPs,
List<Closeable> toCloseHeroically) throws IOException {
@@ -1699,7 +1733,7 @@ public class BKDWriter implements Closeable {
int splitDim;
if (numDims > 1) {
- splitDim = split(minPackedValue, maxPackedValue);
+ splitDim = split(minPackedValue, maxPackedValue, parentSplits);
} else {
splitDim = 0;
}
@@ -1767,10 +1801,11 @@ public class BKDWriter implements Closeable {
}
}
+ parentSplits[splitDim]++;
// Recurse on left tree:
build(2*nodeID, leafNodeOffset, leftSlices,
ordBitSet, out,
- minPackedValue, maxSplitPackedValue,
+ minPackedValue, maxSplitPackedValue, parentSplits,
splitPackedValues, leafBlockFPs, toCloseHeroically);
for(int dim=0;dim<numDims;dim++) {
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
@@ -1783,7 +1818,7 @@ public class BKDWriter implements Closeable {
// Recurse on right tree:
build(2*nodeID+1, leafNodeOffset, rightSlices,
ordBitSet, out,
- minSplitPackedValue, maxPackedValue,
+ minSplitPackedValue, maxPackedValue, parentSplits,
splitPackedValues, leafBlockFPs, toCloseHeroically);
for(int dim=0;dim<numDims;dim++) {
// Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
@@ -1791,6 +1826,7 @@ public class BKDWriter implements Closeable {
rightSlices[dim].writer.destroy();
}
}
+ parentSplits[splitDim]--;
}
}
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index 8b9b7a5..c5c5c1f 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -505,7 +505,45 @@ public class TestBKD extends LuceneTestCase {
}
}
- verify(docValues, null, numDims, numBytesPerDim);
+ // Use a small number of points in leaf blocks to trigger a lot of splitting
+ verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50));
+ }
+
+ // This triggers the logic that makes sure all dimensions get indexed
+ // by looking at how many times each dim has been split
+ public void testOneDimLowCard() throws Exception {
+ int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
+ int numDims = TestUtil.nextInt(random(), 2, 5);
+
+ int numDocs = atLeast(10000);
+ int theLowCardDim = random().nextInt(numDims);
+
+ byte[] value1 = new byte[numBytesPerDim];
+ random().nextBytes(value1);
+ byte[] value2 = value1.clone();
+ if (value2[numBytesPerDim-1] == 0 || random().nextBoolean()) {
+ value2[numBytesPerDim-1]++;
+ } else {
+ value2[numBytesPerDim-1]--;
+ }
+
+ byte[][][] docValues = new byte[numDocs][][];
+
+ for(int docID=0;docID<numDocs;docID++) {
+ byte[][] values = new byte[numDims][];
+ for(int dim=0;dim<numDims;dim++) {
+ if (dim == theLowCardDim) {
+ values[dim] = random().nextBoolean() ? value1 : value2;
+ } else {
+ values[dim] = new byte[numBytesPerDim];
+ random().nextBytes(values[dim]);
+ }
+ }
+ docValues[docID] = values;
+ }
+
+ // Use a small number of points in leaf blocks to trigger a lot of splitting
+ verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50));
}
// this should trigger run-length compression with lengths that are greater than 255
@@ -567,12 +605,14 @@ public class TestBKD extends LuceneTestCase {
verify(docValuesArray, docIDsArray, numDims, numBytesPerDim);
}
-
-
/** docIDs can be null, for the single valued case, else it maps value to docID */
private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim) throws Exception {
+ verify(docValues, docIDs, numDims, numBytesPerDim, TestUtil.nextInt(random(), 50, 1000));
+ }
+
+ private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim,
+ int maxPointsInLeafNode) throws Exception {
try (Directory dir = getDirectory(docValues.length)) {
- int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000);
double maxMB = (float) 3.0 + (3*random().nextDouble());
verify(dir, docValues, docIDs, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB);
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
index 161d1a9..df28105 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
@@ -294,7 +294,7 @@ public class RandomCodec extends AssertingCodec {
}
@Override
- protected int split(byte[] minPackedValue, byte[] maxPackedValue) {
+ protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentDims) {
// BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;)
return random.nextInt(numDims);
}