Merge pull request #43 from DataSketches/hll-udfs

HLL sketch UDFs
diff --git a/src/main/java/com/yahoo/sketches/pig/frequencies/package-info.java b/src/main/java/com/yahoo/sketches/pig/frequencies/package-info.java
index 216d424..4677ad1 100644
--- a/src/main/java/com/yahoo/sketches/pig/frequencies/package-info.java
+++ b/src/main/java/com/yahoo/sketches/pig/frequencies/package-info.java
@@ -4,16 +4,12 @@
  */
 
 /**
- * <p>This package is dedicated to streaming algorithms that enable estimation of the
- * frequency of occurence of items in a weighted multiset stream of items.
- * If the frequency distribution of items is sufficiently skewed, these algorithms are very
- * useful in identifying the "Heavy Hitters" that occured most frequently in the stream.
- * The accuracy of the estimation of the frequency of an item has well understood error
- * bounds that can be returned by the sketch.</p>
+ * Pig UDFs for Frequent Items sketch.
+ * This includes a generic implementation in the form of abstract classes DataToFrequentItemsSketch
+ * and UnionFrequentItemsSketch to be specialized for particular types of items.
+ * An implementation for strings is provided: DataToFrequentStringsSketch and UnionFrequentStringsSketch.
+ * FrequentStringsSketchToEstimates is used to obtain results from sketches.
  *
- * <p>These sketches are mergable and can be serialized and deserialized to/from a compact
- * form.</p>
- *
- * @author Lee Rhodes
+ * @author Alexander Saydakov
  */
 package com.yahoo.sketches.pig.frequencies;
diff --git a/src/main/java/com/yahoo/sketches/pig/quantiles/package-info.java b/src/main/java/com/yahoo/sketches/pig/quantiles/package-info.java
index 19d4733..3de17d8 100644
--- a/src/main/java/com/yahoo/sketches/pig/quantiles/package-info.java
+++ b/src/main/java/com/yahoo/sketches/pig/quantiles/package-info.java
@@ -4,10 +4,18 @@
  */
 
 /**
- * <p>The quantiles package contains stochastic streaming algorithms that enable single-pass 
- * analysis of the distribution of a stream of real (double) values or generic items. 
- * </p>
+ * Pig UDFs for Quantiles sketches.
+ * This includes UDFs for generic ItemsSketch and specialized DoublesSketch.
  * 
- * @author Lee Rhodes
+ * <p>The generic implementation is in the form of abstract classes DataToItemsSketch and
+ * UnionItemsSketch to be specialized for particular types of items.
+ * An implementation for strings is provided: DataToStringsSketch, UnionStringsSketch,
+ * plus UDFs to obtain the results from sketches:
+ * GetQuantileFromStringsSketch, GetQuantilesFromStringsSketch and GetPmfFromStringsSketch.
+ * 
+ * <p>Support for DoublesSketch: DataToDoublesSketch, UnionDoublesSketch,
+ * GetQuantileFromDoublesSketch, GetQuantilesFromDoublesSketch, GetPmfFromDoublesSketch.
+ *
+ * @author Alexander Saydakov
  */
 package com.yahoo.sketches.pig.quantiles;
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/AexcludeB.java b/src/main/java/com/yahoo/sketches/pig/theta/AexcludeB.java
index 7ac84ec..683d229 100644
--- a/src/main/java/com/yahoo/sketches/pig/theta/AexcludeB.java
+++ b/src/main/java/com/yahoo/sketches/pig/theta/AexcludeB.java
@@ -27,7 +27,7 @@
 
 /**
  * This is a Pig UDF that performs the A-NOT-B Set Operation on two given Sketches. Because this
- * operation is fundamentally asymetric, it is structured as a single stateless operation rather
+ * operation is fundamentally asymmetric, it is structured as a single stateless operation rather
  * than stateful as are Union and Intersection UDFs, which can be iterative.
  * The requirement to perform iterative A\B\C\... is rare. If needed, it can be rendered easily by
  * the caller.
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/SketchToString.java b/src/main/java/com/yahoo/sketches/pig/theta/SketchToString.java
index 6a05c64..58fd79b 100644
--- a/src/main/java/com/yahoo/sketches/pig/theta/SketchToString.java
+++ b/src/main/java/com/yahoo/sketches/pig/theta/SketchToString.java
@@ -10,9 +10,7 @@
 import java.io.IOException;
 
 import org.apache.pig.EvalFunc;
-import org.apache.pig.data.DataType;
 import org.apache.pig.data.Tuple;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 import com.yahoo.sketches.Util;
 import com.yahoo.sketches.theta.Sketch;
@@ -80,22 +78,4 @@
     return sketch.toString(true, detailOut, 8, true);
   }
 
-  /**
-   * The output is a String Tuple.
-   */
-  @Override
-  public Schema outputSchema(final Schema input) {
-    if (input != null) {
-      try {
-        final Schema tupleSchema = new Schema();
-        tupleSchema.add(new Schema.FieldSchema("PrettyString", DataType.CHARARRAY));
-        return new Schema(new Schema.FieldSchema(getSchemaName(this
-            .getClass().getName().toLowerCase(), input), tupleSchema, DataType.TUPLE));
-      }
-      catch (final Exception e) {
-        // fall through
-      }
-    }
-    return null;
-  }
 }
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/Union.java b/src/main/java/com/yahoo/sketches/pig/theta/Union.java
index 13edb8f..b669eb0 100644
--- a/src/main/java/com/yahoo/sketches/pig/theta/Union.java
+++ b/src/main/java/com/yahoo/sketches/pig/theta/Union.java
@@ -29,7 +29,6 @@
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 import com.yahoo.memory.Memory;
-import com.yahoo.memory.WritableMemory;
 import com.yahoo.sketches.Util;
 import com.yahoo.sketches.theta.CompactSketch;
 import com.yahoo.sketches.theta.SetOperation;
@@ -298,7 +297,7 @@
       if (type == DataType.BYTEARRAY) {
         final DataByteArray dba = (DataByteArray) f0;
         if (dba.size() > 0) {
-          union.update(WritableMemory.wrap(dba.get()));
+          union.update(Memory.wrap(dba.get()));
         }
       } else {
         throw new IllegalArgumentException("Field type was not DataType.BYTEARRAY: " + type);
diff --git a/src/main/java/com/yahoo/sketches/pig/theta/package-info.java b/src/main/java/com/yahoo/sketches/pig/theta/package-info.java
index 0e31f51..6e95a6f 100644
--- a/src/main/java/com/yahoo/sketches/pig/theta/package-info.java
+++ b/src/main/java/com/yahoo/sketches/pig/theta/package-info.java
@@ -4,12 +4,7 @@
  */
 
 /**
- * <p>The theta package contains all the sketch classes that are members of the 
- * <a href="{@docRoot}/resources/dictionary.html#thetaSketch">Theta Sketch Framework</a>.  
- * The basic sketching functionality in this package is also 
- * accessible from Hadoop Pig UDFs found in the <i>sketches-pig</i> repository, 
- * and from Hadoop Hive UADFs and UDFs found in the <i>sketches-hive</i> repository.
- * </p>
+ * Pig UDFs for Theta sketch.
  * 
  * @author Lee Rhodes
  */
diff --git a/src/main/java/com/yahoo/sketches/pig/tuple/package-info.java b/src/main/java/com/yahoo/sketches/pig/tuple/package-info.java
index 8d69698..0687e55 100644
--- a/src/main/java/com/yahoo/sketches/pig/tuple/package-info.java
+++ b/src/main/java/com/yahoo/sketches/pig/tuple/package-info.java
@@ -3,13 +3,25 @@
  * at the project root for terms.
  */
 /**
- * The tuple package contains implementation of sketches based on the idea of
- * theta sketches with the addition of values associated with unique keys.
+ * Pig UDFs for Tuple sketches.
+ * Tuple sketches are based on the idea of Theta sketches with the addition of
+ * values associated with unique keys.
  * Two sets of tuple sketch classes are available at the moment:
- * generic tuple sketches with user-defined Summary, and a faster specialized
+ * generic Tuple sketches with user-defined Summary, and a faster specialized
  * implementation with an array of double values.
- * See unit tests for usage examples.
  *
+ * <p>There are two sets of Pig UDFs: one for generic Tuple sketch with an example
+ * implementation for DoubleSummary, and another one for a specialized ArrayOfDoublesSketch.
+ * 
+ * <p>The generic implementation is in the form of abstract classes DataToSketch and
+ * UnionSketch to be specialized for particular types of Summary.
+ * An example implementation for DoubleSummary is provided: DataToDoubleSummarySketch and
+ * UnionDoubleSummarySketch, as well as UDFs to obtain the results from sketches:
+ * DoubleSummarySketchToEstimates and DoubleSummarySketchToPercentile.
+ * 
+ * <p>UDFs for ArrayOfDoublesSketch: DataToArrayOfDoublesSketch, UnionArrayOfDoublesSketch,
+ * ArrayOfDoublesSketchToEstimates.
+ * 
  * @author Alexander Saydakov
  */
 package com.yahoo.sketches.pig.tuple;
diff --git a/src/test/java/com/yahoo/sketches/pig/theta/SketchToStringTest.java b/src/test/java/com/yahoo/sketches/pig/theta/SketchToStringTest.java
index 8f6c14d..8a93357 100644
--- a/src/test/java/com/yahoo/sketches/pig/theta/SketchToStringTest.java
+++ b/src/test/java/com/yahoo/sketches/pig/theta/SketchToStringTest.java
@@ -5,7 +5,6 @@
 package com.yahoo.sketches.pig.theta;
 
 import static com.yahoo.sketches.Util.DEFAULT_UPDATE_SEED;
-import static com.yahoo.sketches.pig.PigTestingUtil.LS;
 import static com.yahoo.sketches.pig.PigTestingUtil.createDbaFromQssRange;
 import static org.testng.Assert.assertFalse;
 import static org.testng.Assert.assertNotNull;
@@ -15,11 +14,8 @@
 import java.io.IOException;
 
 import org.apache.pig.EvalFunc;
-import org.apache.pig.data.DataType;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.testng.Assert;
 import org.testng.annotations.Test;
 
 import com.yahoo.sketches.pig.theta.SketchToString;
@@ -81,58 +77,6 @@
   }
   
   @Test
-  public void outputSchemaTest() throws IOException {
-    EvalFunc<String> udf = new SketchToString();
-    
-    Schema inputSchema = null;
-    Schema.FieldSchema inputFieldSchema = new Schema.FieldSchema("Sketch", DataType.BYTEARRAY);
-    
-    Schema nullOutputSchema = null;
-    
-    Schema outputSchema = null;
-    Schema.FieldSchema outputOuterFs0 = null;
-    
-    Schema outputInnerSchema = null;
-    Schema.FieldSchema outputInnerFs0 = null;
-    
-    inputSchema = new Schema(inputFieldSchema);
-    
-    nullOutputSchema = udf.outputSchema(null);
-    
-    outputSchema = udf.outputSchema(inputSchema);
-    outputOuterFs0 = outputSchema.getField(0);
-    
-    outputInnerSchema = outputOuterFs0.schema;
-    outputInnerFs0 = outputInnerSchema.getField(0);
-    
-    Assert.assertNull(nullOutputSchema, "Should be null");
-    Assert.assertNotNull(outputOuterFs0, "outputSchema.getField(0) schema may not be null");
-    
-    String expected = "tuple";
-    String result = DataType.findTypeName(outputOuterFs0.type);
-    Assert.assertEquals(result, expected);
-    
-    expected = "chararray";
-    Assert.assertNotNull(outputInnerFs0, "innerSchema.getField(0) schema may not be null");
-    result = DataType.findTypeName(outputInnerFs0.type);
-    Assert.assertEquals(result, expected);
-    
-    //print schemas
-    //@formatter:off
-    StringBuilder sb = new StringBuilder();
-    sb.append("input schema: ").append(inputSchema).append(LS)
-      .append("output schema: ").append(outputSchema).append(LS)
-      .append("outputOuterFs: ").append(outputOuterFs0)
-        .append(", type: ").append(DataType.findTypeName(outputOuterFs0.type)).append(LS)
-      .append("outputInnerSchema: ").append(outputInnerSchema).append(LS)
-      .append("outputInnerFs0: ").append(outputInnerFs0)
-        .append(", type: ").append(DataType.findTypeName(outputInnerFs0.type)).append(LS);
-    println(sb.toString());
-    //@formatter:on
-    //end print schemas
-  }
-  
-  @Test
   public void printlnTest() {
     println(this.getClass().getSimpleName());
   }