blob: 89de0a536d05b15a7980ae23b4926d6bdb63ed13 [file] [log] [blame] [view]
---
layout: doc_page
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## Theta Sketch Pig UDFs
### Instructions
* get jars
* save the following script as theta.pig
* adjust jar versions and paths if necessary
* save the below data into a file called "data.txt"
* copy data to hdfs: "hdfs dfs -copyFromLocal data.txt"
* run pig script: "pig theta.pig"
### theta.pig script: building sketches, merging sketches and getting estimates
register datasketches-memory-2.0.0.jar;
register datasketches-java-3.1.0.jar;
register datasketches-pig-1.1.0.jar;
define dataToSketch org.apache.datasketches.pig.theta.DataToSketch('32');
define unionSketch org.apache.datasketches.pig.theta.Union('32');
define getEstimate org.apache.datasketches.pig.theta.Estimate();
a = load 'data.txt' as (id, category);
b = group a by category;
c = foreach b generate flatten(group) as (category), flatten(dataToSketch(a.id)) as (sketch);
-- Sketches can be stored at this point in binary format to be used later:
-- store c into 'intermediate/$date' using BinStorage();
-- The next two lines print the results in human readable form for the purpose of this example
d = foreach c generate category, getEstimate(sketch);
dump d;
-- This can be a separate query
-- For example, the first part can produce a daily intermediate feed and store it,
-- and this part can load several instances of this daily intermediate feed and merge them
-- c = load 'intermediate/$date1,intermediate/$date2' using BinStorage() as (category, sketch);
e = group c all;
f = foreach e generate flatten(unionSketch(c.sketch)) as (sketch);
g = foreach f generate getEstimate(sketch);
dump g;
### [data.txt]({{site.docs_dir}}/Theta/data.txt) (tab separated)
The example input data has 2 fields: id and category.
There are 2 categories 'a' and 'b' with 50 unique IDs in each.
Most of the IDs in these categories overlap, so that there are 60 unique IDs in total.
Results:
From 'dump d':
(a,46.91487058420659)
(b,46.23988568048073)
From 'dump g' (merged across categories):
(50.415577215639736)
The expected exact result would be (60.0). The estimate has high relative error because the sketch was configured with only 32 nominal entries
to show the estimation mode for the purposes of this example.
### theta_setops.pig script: set operations on sketches
register datasketches-memory-2.0.0-incubating.jar;
register datasketches-java-3.1.0.jar;
register datasketches-pig-1.1.0.jar;
define dataToSketch org.apache.datasketches.pig.theta.DataToSketch('32');
define unionSketch org.apache.datasketches.pig.theta.Union();
define intersect org.apache.datasketches.pig.theta.Intersect();
define anotb org.apache.datasketches.pig.theta.AexcludeB();
define estimate org.apache.datasketches.pig.theta.Estimate();
a = load 'setops_data.txt' as (id1, id2);
b = group a all;
c = foreach b generate
flatten(dataToSketch(a.id1)) as (sketch1),
flatten(dataToSketch(a.id2)) as (sketch2);
d = foreach c generate
sketch1, -- pass sketches through to have all estimates in one place
sketch2,
flatten(unionSketch(TOBAG(sketch1, sketch2))) as (a_union_b),
flatten(intersect(TOBAG(sketch1, sketch2))) as (a_intersect_b),
flatten(anotb(sketch1, sketch2)) as (a_not_b),
flatten(anotb(sketch2, sketch1)) as (b_not_a);
e = foreach d generate
estimate(sketch1),
estimate(sketch2),
estimate(a_union_b),
estimate(a_intersect_b),
estimate(a_not_b),
estimate(b_not_a);
dump e;
### [setops_data.txt]({{site.docs_dir}}/Theta/setops_data.txt) (tab separated)
Result:
(10.0,12.0,18.0,4.0,6.0,8.0)