blob: dc09e5cca40b963f541b8395f37fb065802a9714 [file] [log] [blame]
-- This script covers the case where the group-by key makes up a significant
-- percentage of each row.
-- Make the jar named by the PIGMIX_JAR parameter available; the loader class
-- referenced below is expected to come from it.
REGISTER $PIGMIX_JAR

-- Read the page_views data set found under the HDFS_ROOT parameter and name
-- its nine fields.
views = LOAD '$HDFS_ROOT/page_views'
    USING org.apache.pig.test.pigmix.udf.PigPerformanceLoader()
    AS (
        user, action, timespent, query_term, ip_addr, timestamp,
        estimated_revenue, page_info, page_links
    );

-- Keep six of the nine fields and cast timespent to int so it can be summed.
-- NOTE: action is projected here although no later statement reads it; it is
-- kept so the statement stays identical to the original query.
projected = FOREACH views GENERATE
    user,
    action,
    (int)timespent AS timespent,
    query_term,
    ip_addr,
    timestamp;

-- Group on a four-field composite key, i.e. most of the projected row.
-- The PARALLEL parameter supplies the value of the PARALLEL clause.
grouped = GROUP projected
    BY (user, query_term, ip_addr, timestamp)
    PARALLEL $PARALLEL;

-- Emit the four key fields (flattened out of the group tuple) followed by the
-- total timespent for that key.
totals = FOREACH grouped GENERATE
    FLATTEN(group),
    SUM(projected.timespent);

-- Write the result under the PIGMIX_OUTPUT parameter's directory.
STORE totals INTO '$PIGMIX_OUTPUT/L6out';