Merge pull request #43 from DataSketches/hll-udfs
HLL sketch UDFs
diff --git a/src/main/java/com/yahoo/sketches/pig/frequencies/package-info.java b/src/main/java/com/yahoo/sketches/pig/frequencies/package-info.java
index 216d424..4677ad1 100644
--- a/src/main/java/com/yahoo/sketches/pig/frequencies/package-info.java
+++ b/src/main/java/com/yahoo/sketches/pig/frequencies/package-info.java
@@ -4,16 +4,12 @@
*/
/**
- * <p>This package is dedicated to streaming algorithms that enable estimation of the
- * frequency of occurence of items in a weighted multiset stream of items.
- * If the frequency distribution of items is sufficiently skewed, these algorithms are very
- * useful in identifying the "Heavy Hitters" that occured most frequently in the stream.
- * The accuracy of the estimation of the frequency of an item has well understood error
- * bounds that can be returned by the sketch.</p>
+ * Pig UDFs for Frequent Items sketch.
+ * This includes a generic implementation in the form of abstract classes DataToFrequentItemsSketch
+ * and UnionFrequentItemsSketch to be specialized for particular types of items.
+ * An implementation for strings is provided: DataToFrequentStringsSketch and UnionFrequentStringsSketch.
+ * FrequentStringsSketchToEstimates is provided to obtain results from sketches.
*
- * <p>These sketches are mergable and can be serialized and deserialized to/from a compact
- * form.</p>
- *
- * @author Lee Rhodes
+ * @author Alexander Saydakov
*/
package com.yahoo.sketches.pig.frequencies;
diff --git a/src/main/java/com/yahoo/sketches/pig/quantiles/package-info.java b/src/main/java/com/yahoo/sketches/pig/quantiles/package-info.java
index 19d4733..3de17d8 100644
--- a/src/main/java/com/yahoo/sketches/pig/quantiles/package-info.java
+++ b/src/main/java/com/yahoo/sketches/pig/quantiles/package-info.java
@@ -4,10 +4,18 @@
*/
/**
- * <p>The quantiles package contains stochastic streaming algorithms that enable single-pass
- * analysis of the distribution of a stream of real (double) values or generic items.
- * </p>
+ * Pig UDFs for Quantiles sketches.
+ * This includes UDFs for generic ItemsSketch and specialized DoublesSketch.
*
- * @author Lee Rhodes
+ * <p>The generic implementation is in the form of abstract classes DataToItemsSketch and
+ * UnionItemsSketch to be specialized for particular types of items.
+ * An implementation for strings is provided: DataToStringsSketch, UnionStringsSketch,
+ * plus UDFs to obtain the results from sketches:
+ * GetQuantileFromStringsSketch, GetQuantilesFromStringsSketch and GetPmfFromStringsSketch.
+ *
+ * <p>Support for DoublesSketch: DataToDoublesSketch, UnionDoublesSketch,
+ * GetQuantileFromDoublesSketch, GetQuantilesFromDoublesSketch, GetPmfFromDoublesSketch.
+ *
+ * @author Alexander Saydakov
*/
package com.yahoo.sketches.pig.quantiles;
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/AexcludeB.java b/src/main/java/com/yahoo/sketches/pig/theta/AexcludeB.java
index 7ac84ec..683d229 100644
--- a/src/main/java/com/yahoo/sketches/pig/theta/AexcludeB.java
+++ b/src/main/java/com/yahoo/sketches/pig/theta/AexcludeB.java
@@ -27,7 +27,7 @@
/**
* This is a Pig UDF that performs the A-NOT-B Set Operation on two given Sketches. Because this
- * operation is fundamentally asymetric, it is structured as a single stateless operation rather
+ * operation is fundamentally asymmetric, it is structured as a single stateless operation rather
* than stateful as are Union and Intersection UDFs, which can be iterative.
* The requirement to perform iterative A\B\C\... is rare. If needed, it can be rendered easily by
* the caller.
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/SketchToString.java b/src/main/java/com/yahoo/sketches/pig/theta/SketchToString.java
index 6a05c64..58fd79b 100644
--- a/src/main/java/com/yahoo/sketches/pig/theta/SketchToString.java
+++ b/src/main/java/com/yahoo/sketches/pig/theta/SketchToString.java
@@ -10,9 +10,7 @@
import java.io.IOException;
import org.apache.pig.EvalFunc;
-import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
import com.yahoo.sketches.Util;
import com.yahoo.sketches.theta.Sketch;
@@ -80,22 +78,4 @@
return sketch.toString(true, detailOut, 8, true);
}
- /**
- * The output is a String Tuple.
- */
- @Override
- public Schema outputSchema(final Schema input) {
- if (input != null) {
- try {
- final Schema tupleSchema = new Schema();
- tupleSchema.add(new Schema.FieldSchema("PrettyString", DataType.CHARARRAY));
- return new Schema(new Schema.FieldSchema(getSchemaName(this
- .getClass().getName().toLowerCase(), input), tupleSchema, DataType.TUPLE));
- }
- catch (final Exception e) {
- // fall through
- }
- }
- return null;
- }
}
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/Union.java b/src/main/java/com/yahoo/sketches/pig/theta/Union.java
index 13edb8f..b669eb0 100644
--- a/src/main/java/com/yahoo/sketches/pig/theta/Union.java
+++ b/src/main/java/com/yahoo/sketches/pig/theta/Union.java
@@ -29,7 +29,6 @@
import org.apache.pig.impl.logicalLayer.schema.Schema;
import com.yahoo.memory.Memory;
-import com.yahoo.memory.WritableMemory;
import com.yahoo.sketches.Util;
import com.yahoo.sketches.theta.CompactSketch;
import com.yahoo.sketches.theta.SetOperation;
@@ -298,7 +297,7 @@
if (type == DataType.BYTEARRAY) {
final DataByteArray dba = (DataByteArray) f0;
if (dba.size() > 0) {
- union.update(WritableMemory.wrap(dba.get()));
+ union.update(Memory.wrap(dba.get()));
}
} else {
throw new IllegalArgumentException("Field type was not DataType.BYTEARRAY: " + type);
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/package-info.java b/src/main/java/com/yahoo/sketches/pig/theta/package-info.java
index 0e31f51..6e95a6f 100644
--- a/src/main/java/com/yahoo/sketches/pig/theta/package-info.java
+++ b/src/main/java/com/yahoo/sketches/pig/theta/package-info.java
@@ -4,12 +4,7 @@
*/
/**
- * <p>The theta package contains all the sketch classes that are members of the
- * <a href="{@docRoot}/resources/dictionary.html#thetaSketch">Theta Sketch Framework</a>.
- * The basic sketching functionality in this package is also
- * accessible from Hadoop Pig UDFs found in the <i>sketches-pig</i> repository,
- * and from Hadoop Hive UADFs and UDFs found in the <i>sketches-hive</i> repository.
- * </p>
+ * Pig UDFs for Theta sketch.
*
* @author Lee Rhodes
*/
diff --git a/src/main/java/com/yahoo/sketches/pig/tuple/package-info.java b/src/main/java/com/yahoo/sketches/pig/tuple/package-info.java
index 8d69698..0687e55 100644
--- a/src/main/java/com/yahoo/sketches/pig/tuple/package-info.java
+++ b/src/main/java/com/yahoo/sketches/pig/tuple/package-info.java
@@ -3,13 +3,25 @@
* at the project root for terms.
*/
/**
- * The tuple package contains implementation of sketches based on the idea of
- * theta sketches with the addition of values associated with unique keys.
+ * Pig UDFs for Tuple sketches.
+ * Tuple sketches are based on the idea of Theta sketches with the addition of
+ * values associated with unique keys.
* Two sets of tuple sketch classes are available at the moment:
- * generic tuple sketches with user-defined Summary, and a faster specialized
+ * generic Tuple sketches with user-defined Summary, and a faster specialized
* implementation with an array of double values.
- * See unit tests for usage examples.
*
+ * <p>There are two sets of Pig UDFs: one for generic Tuple sketch with an example
+ * implementation for DoubleSummary, and another one for a specialized ArrayOfDoublesSketch.
+ *
+ * <p>The generic implementation is in the form of abstract classes DataToSketch and
+ * UnionSketch to be specialized for particular types of Summary.
+ * An example implementation for DoubleSummary is provided: DataToDoubleSummarySketch and
+ * UnionDoubleSummarySketch, as well as UDFs to obtain the results from sketches:
+ * DoubleSummarySketchToEstimates and DoubleSummarySketchToPercentile.
+ *
+ * <p>UDFs for ArrayOfDoublesSketch: DataToArrayOfDoublesSketch, UnionArrayOfDoublesSketch,
+ * ArrayOfDoublesSketchToEstimates.
+ *
* @author Alexander Saydakov
*/
package com.yahoo.sketches.pig.tuple;
diff --git a/src/test/java/com/yahoo/sketches/pig/theta/SketchToStringTest.java b/src/test/java/com/yahoo/sketches/pig/theta/SketchToStringTest.java
index 8f6c14d..8a93357 100644
--- a/src/test/java/com/yahoo/sketches/pig/theta/SketchToStringTest.java
+++ b/src/test/java/com/yahoo/sketches/pig/theta/SketchToStringTest.java
@@ -5,7 +5,6 @@
package com.yahoo.sketches.pig.theta;
import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;
-import static com.yahoo.sketches.pig.PigTestingUtil.LS;
import static com.yahoo.sketches.pig.PigTestingUtil.createDbaFromQssRange;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertNotNull;
@@ -15,11 +14,8 @@
import java.io.IOException;
import org.apache.pig.EvalFunc;
-import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.testng.Assert;
import org.testng.annotations.Test;
import com.yahoo.sketches.pig.theta.SketchToString;
@@ -81,58 +77,6 @@
}
@Test
- public void outputSchemaTest() throws IOException {
- EvalFunc<String> udf = new SketchToString();
-
- Schema inputSchema = null;
- Schema.FieldSchema inputFieldSchema = new Schema.FieldSchema("Sketch", DataType.BYTEARRAY);
-
- Schema nullOutputSchema = null;
-
- Schema outputSchema = null;
- Schema.FieldSchema outputOuterFs0 = null;
-
- Schema outputInnerSchema = null;
- Schema.FieldSchema outputInnerFs0 = null;
-
- inputSchema = new Schema(inputFieldSchema);
-
- nullOutputSchema = udf.outputSchema(null);
-
- outputSchema = udf.outputSchema(inputSchema);
- outputOuterFs0 = outputSchema.getField(0);
-
- outputInnerSchema = outputOuterFs0.schema;
- outputInnerFs0 = outputInnerSchema.getField(0);
-
- Assert.assertNull(nullOutputSchema, "Should be null");
- Assert.assertNotNull(outputOuterFs0, "outputSchema.getField(0) schema may not be null");
-
- String expected = "tuple";
- String result = DataType.findTypeName(outputOuterFs0.type);
- Assert.assertEquals(result, expected);
-
- expected = "chararray";
- Assert.assertNotNull(outputInnerFs0, "innerSchema.getField(0) schema may not be null");
- result = DataType.findTypeName(outputInnerFs0.type);
- Assert.assertEquals(result, expected);
-
- //print schemas
- //@formatter:off
- StringBuilder sb = new StringBuilder();
- sb.append("input schema: ").append(inputSchema).append(LS)
- .append("output schema: ").append(outputSchema).append(LS)
- .append("outputOuterFs: ").append(outputOuterFs0)
- .append(", type: ").append(DataType.findTypeName(outputOuterFs0.type)).append(LS)
- .append("outputInnerSchema: ").append(outputInnerSchema).append(LS)
- .append("outputInnerFs0: ").append(outputInnerFs0)
- .append(", type: ").append(DataType.findTypeName(outputInnerFs0.type)).append(LS);
- println(sb.toString());
- //@formatter:on
- //end print schemas
- }
-
- @Test
public void printlnTest() {
println(this.getClass().getSimpleName());
}