| diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java |
| index bf360d3..5e391f4 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java |
| @@ -479,9 +479,12 @@ public class BKDWriter implements Closeable { |
| docsSeen.set(values.getDocID(i)); |
| } |
| |
| + final int[] parentSplits = new int[numDims]; |
| build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out, |
| - minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, |
| + minPackedValue, maxPackedValue, parentSplits, |
| + splitPackedValues, leafBlockFPs, |
| new int[maxPointsInLeafNode]); |
| + assert Arrays.equals(parentSplits, new int[numDims]); |
| |
| long indexFP = out.getFilePointer(); |
| writeIndex(out, leafBlockFPs, splitPackedValues); |
| @@ -1001,12 +1004,15 @@ public class BKDWriter implements Closeable { |
| heapPointWriter = null; |
| } |
| |
| + final int[] parentSplits = new int[numDims]; |
| build(1, numLeaves, sortedPointWriters, |
| ordBitSet, out, |
| minPackedValue, maxPackedValue, |
| + parentSplits, |
| splitPackedValues, |
| leafBlockFPs, |
| toCloseHeroically); |
| + assert Arrays.equals(parentSplits, new int[numDims]); |
| |
| for(PathSlice slice : sortedPointWriters) { |
| slice.writer.destroy(); |
| @@ -1413,7 +1419,29 @@ public class BKDWriter implements Closeable { |
| return true; |
| } |
| |
| - protected int split(byte[] minPackedValue, byte[] maxPackedValue) { |
| + /** |
| + * Pick the next dimension to split. |
| + * @param minPackedValue the min values for all dimensions |
| + * @param maxPackedValue the max values for all dimensions |
| + * @param parentSplits how many times each dim has been split on the parent levels |
| + * @return the dimension to split |
| + */ |
| + protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits) { |
| + // First look at whether there is a dimension that has split less than 2x less than |
| + // the dim that has most splits, and return it if there is such a dimension and it |
| + // does not only have equals values. This helps ensure all dimensions are indexed. |
| + int maxNumSplits = 0; |
| + for (int numSplits : parentSplits) { |
| + maxNumSplits = Math.max(maxNumSplits, numSplits); |
| + } |
| + for (int dim = 0; dim < numDims; ++dim) { |
| + final int offset = dim * bytesPerDim; |
| + if (parentSplits[dim] < maxNumSplits / 2 && |
| + StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) != 0) { |
| + return dim; |
| + } |
| + } |
| + |
| // Find which dim has the largest span so we can split on it: |
| int splitDim = -1; |
| for(int dim=0;dim<numDims;dim++) { |
| @@ -1454,6 +1482,7 @@ public class BKDWriter implements Closeable { |
| MutablePointValues reader, int from, int to, |
| IndexOutput out, |
| byte[] minPackedValue, byte[] maxPackedValue, |
| + int[] parentSplits, |
| byte[] splitPackedValues, |
| long[] leafBlockFPs, |
| int[] spareDocIds) throws IOException { |
| @@ -1547,7 +1576,7 @@ public class BKDWriter implements Closeable { |
| // inner node |
| |
| // compute the split dimension and partition around it |
| - final int splitDim = split(minPackedValue, maxPackedValue); |
| + final int splitDim = split(minPackedValue, maxPackedValue, parentSplits); |
| final int mid = (from + to + 1) >>> 1; |
| |
| int commonPrefixLen = bytesPerDim; |
| @@ -1575,10 +1604,14 @@ public class BKDWriter implements Closeable { |
| maxSplitPackedValue, splitDim * bytesPerDim, bytesPerDim); |
| |
| // recurse |
| + parentSplits[splitDim]++; |
| build(nodeID * 2, leafNodeOffset, reader, from, mid, out, |
| - minPackedValue, maxSplitPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); |
| + minPackedValue, maxSplitPackedValue, parentSplits, |
| + splitPackedValues, leafBlockFPs, spareDocIds); |
| build(nodeID * 2 + 1, leafNodeOffset, reader, mid, to, out, |
| - minSplitPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs, spareDocIds); |
| + minSplitPackedValue, maxPackedValue, parentSplits, |
| + splitPackedValues, leafBlockFPs, spareDocIds); |
| + parentSplits[splitDim]--; |
| } |
| } |
| |
| @@ -1589,6 +1622,7 @@ public class BKDWriter implements Closeable { |
| LongBitSet ordBitSet, |
| IndexOutput out, |
| byte[] minPackedValue, byte[] maxPackedValue, |
| + int[] parentSplits, |
| byte[] splitPackedValues, |
| long[] leafBlockFPs, |
| List<Closeable> toCloseHeroically) throws IOException { |
| @@ -1699,7 +1733,7 @@ public class BKDWriter implements Closeable { |
| |
| int splitDim; |
| if (numDims > 1) { |
| - splitDim = split(minPackedValue, maxPackedValue); |
| + splitDim = split(minPackedValue, maxPackedValue, parentSplits); |
| } else { |
| splitDim = 0; |
| } |
| @@ -1767,10 +1801,11 @@ public class BKDWriter implements Closeable { |
| } |
| } |
| |
| + parentSplits[splitDim]++; |
| // Recurse on left tree: |
| build(2*nodeID, leafNodeOffset, leftSlices, |
| ordBitSet, out, |
| - minPackedValue, maxSplitPackedValue, |
| + minPackedValue, maxSplitPackedValue, parentSplits, |
| splitPackedValues, leafBlockFPs, toCloseHeroically); |
| for(int dim=0;dim<numDims;dim++) { |
| // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim: |
| @@ -1783,7 +1818,7 @@ public class BKDWriter implements Closeable { |
| // Recurse on right tree: |
| build(2*nodeID+1, leafNodeOffset, rightSlices, |
| ordBitSet, out, |
| - minSplitPackedValue, maxPackedValue, |
| + minSplitPackedValue, maxPackedValue, parentSplits, |
| splitPackedValues, leafBlockFPs, toCloseHeroically); |
| for(int dim=0;dim<numDims;dim++) { |
| // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim: |
| @@ -1791,6 +1826,7 @@ public class BKDWriter implements Closeable { |
| rightSlices[dim].writer.destroy(); |
| } |
| } |
| + parentSplits[splitDim]--; |
| } |
| } |
| |
| diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java |
| index 8b9b7a5..c5c5c1f 100644 |
| --- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java |
| +++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java |
| @@ -505,7 +505,45 @@ public class TestBKD extends LuceneTestCase { |
| } |
| } |
| |
| - verify(docValues, null, numDims, numBytesPerDim); |
| + // Use a small number of points in leaf blocks to trigger a lot of splitting |
| + verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50)); |
| + } |
| + |
| + // This triggers the logic that makes sure all dimensions get indexed |
| + // by looking at how many times each dim has been split |
| + public void testOneDimLowCard() throws Exception { |
| + int numBytesPerDim = TestUtil.nextInt(random(), 2, 30); |
| + int numDims = TestUtil.nextInt(random(), 2, 5); |
| + |
| + int numDocs = atLeast(10000); |
| + int theLowCardDim = random().nextInt(numDims); |
| + |
| + byte[] value1 = new byte[numBytesPerDim]; |
| + random().nextBytes(value1); |
| + byte[] value2 = value1.clone(); |
| + if (value2[numBytesPerDim-1] == 0 || random().nextBoolean()) { |
| + value2[numBytesPerDim-1]++; |
| + } else { |
| + value2[numBytesPerDim-1]--; |
| + } |
| + |
| + byte[][][] docValues = new byte[numDocs][][]; |
| + |
| + for(int docID=0;docID<numDocs;docID++) { |
| + byte[][] values = new byte[numDims][]; |
| + for(int dim=0;dim<numDims;dim++) { |
| + if (dim == theLowCardDim) { |
| + values[dim] = random().nextBoolean() ? value1 : value2; |
| + } else { |
| + values[dim] = new byte[numBytesPerDim]; |
| + random().nextBytes(values[dim]); |
| + } |
| + } |
| + docValues[docID] = values; |
| + } |
| + |
| + // Use a small number of points in leaf blocks to trigger a lot of splitting |
| + verify(docValues, null, numDims, numBytesPerDim, TestUtil.nextInt(random(), 20, 50)); |
| } |
| |
| // this should trigger run-length compression with lengths that are greater than 255 |
| @@ -567,12 +605,14 @@ public class TestBKD extends LuceneTestCase { |
| verify(docValuesArray, docIDsArray, numDims, numBytesPerDim); |
| } |
| |
| - |
| - |
| /** docIDs can be null, for the single valued case, else it maps value to docID */ |
| private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim) throws Exception { |
| + verify(docValues, docIDs, numDims, numBytesPerDim, TestUtil.nextInt(random(), 50, 1000)); |
| + } |
| + |
| + private void verify(byte[][][] docValues, int[] docIDs, int numDims, int numBytesPerDim, |
| + int maxPointsInLeafNode) throws Exception { |
| try (Directory dir = getDirectory(docValues.length)) { |
| - int maxPointsInLeafNode = TestUtil.nextInt(random(), 50, 1000); |
| double maxMB = (float) 3.0 + (3*random().nextDouble()); |
| verify(dir, docValues, docIDs, numDims, numBytesPerDim, maxPointsInLeafNode, maxMB); |
| } |
| diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java |
| index 161d1a9..df28105 100644 |
| --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java |
| +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java |
| @@ -294,7 +294,7 @@ public class RandomCodec extends AssertingCodec { |
| } |
| |
| @Override |
| - protected int split(byte[] minPackedValue, byte[] maxPackedValue) { |
| + protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentDims) { |
| // BKD normally defaults by the widest dimension, to try to make as squarish cells as possible, but we just pick a random one ;) |
| return random.nextInt(numDims); |
| } |