Merge branch 'master' into 1.2.X-incubating
diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
index e32b735..a50d920 100644
--- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
@@ -172,11 +172,6 @@
count = mem.getInt(offset);
offset += Integer.BYTES;
}
- // if (version == serialVersionWithSummaryFactoryUID) {
- // final DeserializeResult<SummaryFactory<S>> factoryResult =
- // SerializerDeserializer.deserializeFromMemory(mem, offset);
- // offset += factoryResult.getSize();
- // }
final int currentCapacity = 1 << lgCurrentCapacity_;
keys_ = new long[currentCapacity];
for (int i = 0; i < count; i++) {
diff --git a/src/main/java/org/apache/datasketches/tuple/Union.java b/src/main/java/org/apache/datasketches/tuple/Union.java
index 6b024c1..7800b55 100644
--- a/src/main/java/org/apache/datasketches/tuple/Union.java
+++ b/src/main/java/org/apache/datasketches/tuple/Union.java
@@ -19,8 +19,13 @@
package org.apache.datasketches.tuple;
+import static java.lang.Math.min;
import static org.apache.datasketches.Util.DEFAULT_NOMINAL_ENTRIES;
+import java.lang.reflect.Array;
+
+import org.apache.datasketches.QuickSelect;
+
/**
* Compute a union of two or more tuple sketches.
* A new instance represents an empty set.
@@ -29,10 +34,10 @@
* @param <S> Type of Summary
*/
public class Union<S extends Summary> {
- private final int nomEntries_;
private final SummarySetOperations<S> summarySetOps_;
private QuickSelectSketch<S> sketch_;
private long theta_; // need to maintain outside of the sketch
+ private boolean isEmpty_;
/**
* Creates new instance with default nominal entries
@@ -49,10 +54,10 @@
* @param summarySetOps instance of SummarySetOperations
*/
public Union(final int nomEntries, final SummarySetOperations<S> summarySetOps) {
- nomEntries_ = nomEntries;
summarySetOps_ = summarySetOps;
- sketch_ = new QuickSelectSketch<S>(nomEntries, null);
+ sketch_ = new QuickSelectSketch<>(nomEntries, null);
theta_ = sketch_.getThetaLong();
+ isEmpty_ = true;
}
/**
@@ -60,31 +65,72 @@
* @param sketchIn input sketch to add to the internal set
*/
public void update(final Sketch<S> sketchIn) {
- if (sketchIn == null || sketchIn.isEmpty()) { return; }
+ if ((sketchIn == null) || sketchIn.isEmpty()) { return; }
+ isEmpty_ = false;
if (sketchIn.theta_ < theta_) { theta_ = sketchIn.theta_; }
final SketchIterator<S> it = sketchIn.iterator();
while (it.next()) {
sketch_.merge(it.getKey(), it.getSummary(), summarySetOps_);
}
+ if (sketch_.theta_ < theta_) {
+ theta_ = sketch_.theta_;
+ }
}
/**
* Gets the internal set as a CompactSketch
* @return result of the unions so far
*/
+ @SuppressWarnings("unchecked")
public CompactSketch<S> getResult() {
- sketch_.trim();
- if (theta_ < sketch_.theta_) {
- sketch_.setThetaLong(theta_);
- sketch_.rebuild();
+ if (isEmpty_) {
+ return sketch_.compact();
}
- return sketch_.compact();
+ if ((theta_ >= sketch_.theta_) && (sketch_.getRetainedEntries() <= sketch_.getNominalEntries())) {
+ return sketch_.compact();
+ }
+ long theta = min(theta_, sketch_.theta_);
+
+ int num = 0;
+ {
+ final SketchIterator<S> it = sketch_.iterator();
+ while (it.next()) {
+ if (it.getKey() < theta) { num++; }
+ }
+ }
+ if (num == 0) {
+ return new CompactSketch<>(null, null, theta, isEmpty_);
+ }
+ if (num > sketch_.getNominalEntries()) {
+ final long[] keys = new long[num]; // temporary since the order will be destroyed by quick select
+ final SketchIterator<S> it = sketch_.iterator();
+ int i = 0;
+ while (it.next()) {
+ if (it.getKey() < theta) { keys[i++] = it.getKey(); }
+ }
+ theta = QuickSelect.select(keys, 0, num - 1, sketch_.getNominalEntries());
+ num = sketch_.getNominalEntries();
+ }
+ final long[] keys = new long[num];
+ final S[] summaries = (S[]) Array.newInstance(sketch_.summaries_.getClass().getComponentType(), num);
+ final SketchIterator<S> it = sketch_.iterator();
+ int i = 0;
+ while (it.next()) {
+ if (it.getKey() < theta) {
+ keys[i] = it.getKey();
+ summaries[i] = (S) it.getSummary().copy();
+ i++;
+ }
+ }
+ return new CompactSketch<>(keys, summaries, theta, isEmpty_);
}
/**
* Resets the internal set to the initial state, which represents an empty set
*/
public void reset() {
- sketch_ = new QuickSelectSketch<S>(nomEntries_, null);
+ sketch_.reset();
+ theta_ = sketch_.getThetaLong();
+ isEmpty_ = true;
}
}
diff --git a/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java b/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java
index dd07d42..174e282 100644
--- a/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java
@@ -55,8 +55,8 @@
* <a href="{@docRoot}/resources/dictionary.html#p">See Sampling Probability</a>
* @param summaryFactory An instance of a SummaryFactory.
*/
- public UpdatableSketch(final int nomEntries, final int lgResizeFactor, final float samplingProbability,
- final SummaryFactory<S> summaryFactory) {
+ public UpdatableSketch(final int nomEntries, final int lgResizeFactor,
+ final float samplingProbability, final SummaryFactory<S> summaryFactory) {
super(nomEntries, lgResizeFactor, samplingProbability, summaryFactory);
}
diff --git a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java
index 57cc8e6..92e2cbe 100644
--- a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java
@@ -35,7 +35,26 @@
* @param mode The DoubleSummary mode to be used
*/
public DoubleSketch(final int lgK, final DoubleSummary.Mode mode) {
- super(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, new DoubleSummaryFactory(mode));
+ this(lgK, ResizeFactor.X8.ordinal(), 1.0F, mode);
+ }
+
+ /**
+ * Creates this sketch with the following parameters:
+ * @param lgK Log_base2 of <i>Nominal Entries</i>.
+ * @param lgResizeFactor log2(resizeFactor) - value from 0 to 3:
+ * <pre>
+ * 0 - no resizing (max size allocated),
+ * 1 - double internal hash table each time it reaches a threshold
+ * 2 - grow four times
+ * 3 - grow eight times (default)
+ * </pre>
+ * @param samplingProbability
+ * <a href="{@docRoot}/resources/dictionary.html#p">See Sampling Probability</a>
+ * @param mode The DoubleSummary mode to be used
+ */
+ public DoubleSketch(final int lgK, final int lgResizeFactor, final float samplingProbability,
+ final DoubleSummary.Mode mode) {
+ super(1 << lgK, lgResizeFactor, samplingProbability, new DoubleSummaryFactory(mode));
}
/**
diff --git a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java
index 9d75912..03ca7d0 100644
--- a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java
@@ -35,7 +35,26 @@
* @param mode The IntegerSummary mode to be used
*/
public IntegerSketch(final int lgK, final IntegerSummary.Mode mode) {
- super(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, new IntegerSummaryFactory(mode));
+ this(lgK, ResizeFactor.X8.ordinal(), 1.0F, mode);
+ }
+
+ /**
+ * Creates this sketch with the following parameters:
+ * @param lgK Log_base2 of <i>Nominal Entries</i>.
+ * @param lgResizeFactor log2(resizeFactor) - value from 0 to 3:
+ * <pre>
+ * 0 - no resizing (max size allocated),
+ * 1 - double internal hash table each time it reaches a threshold
+ * 2 - grow four times
+ * 3 - grow eight times (default)
+ * </pre>
+ * @param samplingProbability
+ * <a href="{@docRoot}/resources/dictionary.html#p">See Sampling Probability</a>
+ * @param mode The IntegerSummary mode to be used
+ */
+ public IntegerSketch(final int lgK, final int lgResizeFactor, final float samplingProbability,
+ final IntegerSummary.Mode mode) {
+ super(1 << lgK, lgResizeFactor, samplingProbability, new IntegerSummaryFactory(mode));
}
/**
diff --git a/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java b/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
index 8e1aefa..cfc4dcf 100644
--- a/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
+++ b/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
@@ -230,13 +230,6 @@
Assert.assertEquals(sketch.getEstimate(), 6.0);
}
-// @Test
-// public void updateDoubleSummary() {
-// DoubleSummary ds = new DoubleSummary();
-// ds.update(1.0);
-// Assert.assertEquals(ds.getValue(), 1.0);
-// }
-
@Test
public void doubleSummaryDefaultSumMode() {
UpdatableSketch<Double, DoubleSummary> sketch =
@@ -403,6 +396,22 @@
}
@Test
+ public void unionEmptySampling() {
+ UpdatableSketch<Double, DoubleSummary> sketch =
+ new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).setSamplingProbability(0.01f).build();
+ sketch.update(1, 1.0);
+ Assert.assertEquals(sketch.getRetainedEntries(), 0); // not retained due to low sampling probability
+
+ Union<DoubleSummary> union = new Union<>(new DoubleSummarySetOperations(mode));
+ union.update(sketch);
+ CompactSketch<DoubleSummary> result = union.getResult();
+ Assert.assertEquals(result.getRetainedEntries(), 0);
+ Assert.assertFalse(result.isEmpty());
+ Assert.assertTrue(result.isEstimationMode());
+ Assert.assertEquals(result.getEstimate(), 0.0);
+ }
+
+ @Test
public void unionExactMode() {
UpdatableSketch<Double, DoubleSummary> sketch1 =
new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();