blob: fc5dc40796b4886691c74ceb511bd33057bfea4c [file] [log] [blame]
-- This script covers multi-store queries.
register $PIGMIX_JAR
A = load '$HDFS_ROOT/page_views' using org.apache.pig.test.pigmix.udf.PigPerformanceLoader()
as (user, action, timespent, query_term, ip_addr, timestamp,
estimated_revenue, page_info, page_links);
B = foreach A generate user, action, (int)timespent as timespent, query_term,
(double)estimated_revenue as estimated_revenue;
split B into C if user is not null, alpha if user is null;
split C into D if query_term is not null, aleph if query_term is null;
E = group D by user parallel $PARALLEL;
F = foreach E generate group, MAX(D.estimated_revenue);
store F into '$PIGMIX_OUTPUT/highest_value_page_per_user';
beta = group alpha by query_term parallel $PARALLEL;
gamma = foreach beta generate group, SUM(alpha.timespent);
store gamma into '$PIGMIX_OUTPUT/total_timespent_per_term';
beth = group aleph by action parallel $PARALLEL;
gimel = foreach beth generate group, COUNT(aleph);
store gimel into '$PIGMIX_OUTPUT/queries_per_action';