Updated class javadoc to include the following comment:
The code included here works well for moderate-sized partitioning
tasks. As an example, using the test code in the test branch, the
partitioning task of splitting a data set of 1 billion items into 324
partitions of about 3M items each completed in under 3 minutes on a
single CPU. For much larger partitioning tasks, it is recommended that
this code be adapted to run in a parallelized systems environment.
I made some minor tweaks to the test code examples.
diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java
index b56356f..be256e4 100644
--- a/src/main/java/org/apache/datasketches/partitions/Partitioner.java
+++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java
@@ -41,6 +41,12 @@
/**
* A partitioning process that can partition very large data sets into thousands
* of partitions of approximately the same size.
+ *
+ * <p>The code included here works well for moderate-sized partitioning tasks.
+ * As an example, using the test code in the test branch, the partitioning task of splitting
+ * a data set of 1 billion items into 324 partitions of about 3M items each completed in under 3 minutes
+ * on a single CPU. For much larger partitioning tasks, it is recommended that this code be adapted to run in a
+ * parallelized systems environment.</p>
* @param <T> the data type
* @param <S> the quantiles sketch that implements both QuantilesGenericAPI and PartitioningFeature.
*/
diff --git a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java
index 4c7e26a..52e6c50 100644
--- a/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java
+++ b/src/test/java/org/apache/datasketches/partitions/ClassicPartitionsTest.java
@@ -20,6 +20,7 @@
package org.apache.datasketches.partitions;
import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH;
+import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE;
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
import java.util.List;
@@ -27,6 +28,7 @@
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow;
import org.apache.datasketches.quantiles.ItemsSketch;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.testng.annotations.Test;
/**
@@ -44,42 +46,48 @@
* Launch the partitioner as an application with the following arguments as strings:
* <ul>
* <li>arg[0]: int k, the size of the sketch</li>
- * <li>arg[1]: long totalN, the total size, in elements, of the data set to parse.</li>
- * <li>arg[2]: long tgtPartitionSize, the target number of elements per resulting partition.</li>
- * <li>arg[3]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch</li>
+ * <li>arg[1]: String INCLUSIVE or EXCLUSIVE, the search criteria.</li>
+ * <li>arg[2]: long totalN, the total size, in elements, of the data set to parse.</li>
+ * <li>arg[3]: long tgtPartitionSize, the target number of elements per resulting partition.</li>
+ * <li>arg[4]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch</li>
* </ul>
* @param args input arguments as defined above.
*/
public void main(String[] args) {
final int k, maxPartsPerSk;
final long totalN, tgtPartitionSize;
+ final QuantileSearchCriteria searchCrit;
try {
k = Integer.parseInt(args[0].trim());
- totalN = Long.parseLong(args[1].trim());
- tgtPartitionSize = Long.parseLong(args[2].trim());
- maxPartsPerSk = Integer.parseInt(args[3].trim());
+ searchCrit = args[1].trim().equalsIgnoreCase("INCLUSIVE") ? INCLUSIVE : EXCLUSIVE;
+ totalN = Long.parseLong(args[2].trim());
+ tgtPartitionSize = Long.parseLong(args[3].trim());
+ maxPartsPerSk = Integer.parseInt(args[4].trim());
} catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); }
- classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk);
+ classicPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk);
}
//@Test //launch from TestNG
public void checkClassicPartitioner() {
final int k = 1 << 15;
- final long totalN = 1000_000_000L; //artificially set low so it will execute fast
+ final QuantileSearchCriteria searchCrit = INCLUSIVE;
+ final long totalN = 30_000_000L; //artificially set low so it will execute fast as a simple test
final long tgtPartitionSize = 3_000_000L;
final int maxPartsPerSk = 100;
- classicPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk);
+ classicPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk);
}
/**
* Programmatic call to classic Partitioner
* @param k the size of the sketch.
+ * @param searchCrit the QuantileSearchCriteria: either INCLUSIVE or EXCLUSIVE.
* @param totalN the total size, in elements, of the data set to parse.
* @param tgtPartitionSize the target number of elements per resulting partition.
* @param maxPartsPerSk the maximum number of partitions to be handled by any one sketch.
*/
public void classicPartitioner(
final int k,
+ final QuantileSearchCriteria searchCrit,
final long totalN,
final long tgtPartitionSize,
final int maxPartsPerSk) {
@@ -92,7 +100,7 @@
tgtPartitionSize,
maxPartsPerSk,
fillReq,
- INCLUSIVE);
+ searchCrit);
final List<PartitionBoundsRow<String>> list = partitioner.partition(sk);
final long endTime_mS = System.currentTimeMillis();
final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS;
@@ -102,6 +110,7 @@
"Classic",
list,
k,
+ searchCrit,
totalN,
tgtPartitionSize,
maxPartsPerSk,
diff --git a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java
index 50eff57..3937d16 100644
--- a/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java
+++ b/src/test/java/org/apache/datasketches/partitions/KllPartitionsTest.java
@@ -20,6 +20,7 @@
package org.apache.datasketches.partitions;
import static org.apache.datasketches.partitions.BoundsRule.INCLUDE_BOTH;
+import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE;
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
import java.util.List;
@@ -27,6 +28,7 @@
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.kll.KllItemsSketch;
import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
import org.testng.annotations.Test;
/**
@@ -44,42 +46,48 @@
* Launch the partitioner as an application with the following arguments as strings:
* <ul>
* <li>arg[0]: int k, the size of the sketch</li>
- * <li>arg[1]: long totalN, the total size, in elements, of the data set to parse.</li>
- * <li>arg[2]: long tgtPartitionSize, the target number of elements per resulting partition.</li>
- * <li>arg[3]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch</li>
+ * <li>arg[1]: String INCLUSIVE or EXCLUSIVE, the search criteria.</li>
+ * <li>arg[2]: long totalN, the total size, in elements, of the data set to parse.</li>
+ * <li>arg[3]: long tgtPartitionSize, the target number of elements per resulting partition.</li>
+ * <li>arg[4]: int maxPartsPerSk, the maximum number of partitions to be handled by any one sketch</li>
* </ul>
* @param args input arguments as defined above.
*/
public void main(String[] args) {
final int k, maxPartsPerSk;
final long totalN, tgtPartitionSize;
+ final QuantileSearchCriteria searchCrit;
try {
k = Integer.parseInt(args[0].trim());
- totalN = Long.parseLong(args[1].trim());
- tgtPartitionSize = Long.parseLong(args[2].trim());
- maxPartsPerSk = Integer.parseInt(args[3].trim());
+ searchCrit = args[1].trim().equalsIgnoreCase("INCLUSIVE") ? INCLUSIVE : EXCLUSIVE;
+ totalN = Long.parseLong(args[2].trim());
+ tgtPartitionSize = Long.parseLong(args[3].trim());
+ maxPartsPerSk = Integer.parseInt(args[4].trim());
} catch (NumberFormatException e) { throw new SketchesArgumentException(e.toString()); }
- kllPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk);
+ kllPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk);
}
//@Test //launch from TestNG
public void checkKllPartitioner() {
final int k = 1 << 15;
- final long totalN = 30_000_000L; //artificially set low so it will execute fast
+ final QuantileSearchCriteria searchCrit = INCLUSIVE;
+ final long totalN = 30_000_000L; //artificially set low so it will execute fast as a simple test
final long tgtPartitionSize = 3_000_000L;
final int maxPartsPerSk = 100;
- kllPartitioner(k, totalN, tgtPartitionSize, maxPartsPerSk);
+ kllPartitioner(k, searchCrit, totalN, tgtPartitionSize, maxPartsPerSk);
}
/**
* Programmatic call to KLL Partitioner
* @param k the size of the sketch.
+ * @param searchCrit the QuantileSearchCriteria: either INCLUSIVE or EXCLUSIVE.
* @param totalN the total size, in elements, of the data set to parse.
* @param tgtPartitionSize the target number of elements per resulting partition.
* @param maxPartsPerSk the maximum number of partitions to be handled by any one sketch.
*/
public void kllPartitioner(
final int k,
+ final QuantileSearchCriteria searchCrit,
final long totalN,
final long tgtPartitionSize,
final int maxPartsPerSk) {
@@ -92,7 +100,7 @@
tgtPartitionSize,
maxPartsPerSk,
fillReq,
- INCLUSIVE);
+ searchCrit);
final List<PartitionBoundsRow<String>> list = partitioner.partition(sk);
final long endTime_mS = System.currentTimeMillis();
final long fillInitialSketchTime_mS = endFillInitialSketchTime_mS - startTime_mS;
@@ -102,6 +110,7 @@
"KLL",
list,
k,
+ searchCrit,
totalN,
tgtPartitionSize,
maxPartsPerSk,
diff --git a/src/test/java/org/apache/datasketches/partitions/PartitionResults.java b/src/test/java/org/apache/datasketches/partitions/PartitionResults.java
index 501820d..b061ce9 100644
--- a/src/test/java/org/apache/datasketches/partitions/PartitionResults.java
+++ b/src/test/java/org/apache/datasketches/partitions/PartitionResults.java
@@ -27,6 +27,7 @@
import java.util.List;
import org.apache.datasketches.partitions.Partitioner.PartitionBoundsRow;
+import org.apache.datasketches.quantilescommon.QuantileSearchCriteria;
/**
* Output partitioning results to console.
@@ -42,6 +43,7 @@
final String sketchType,
final List<PartitionBoundsRow<String>> list,
final int k,
+ final QuantileSearchCriteria searchCrit,
final long totalN,
final long tgtPartitionSize,
final int maxPartsPerSk,
@@ -75,6 +77,7 @@
println(LS + sketchType +" ItemsSketch Partitions Test");
println(LS + "INPUT:");
printf("Sketch K :%,20d\n", k);
+ printf("Search Criteria :%20s\n", searchCrit.name());
printf("Total N :%,20d\n", totalN);
printf("Tgt Partition Size :%,20d\n", tgtPartitionSize);
printf("Max Parts Per Sketch :%20d\n", maxPartsPerSk);