blob: ce62d666f5d7b5d36964324492525228c0ae7aa0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
-- Make the DataFu UDF jar (given as a relative path) available to this script.
REGISTER '../../datafu-pig/build/libs/datafu-pig-1.2.1.jar';
-- Short aliases for the DataFu UDFs used below, bound to their constructor arguments.
-- Sessionize is constructed with '10m' (presumably the inactivity timeout that
-- starts a new session -- confirm against the DataFu documentation).
DEFINE Sessionize datafu.pig.sessions.Sessionize('10m');
-- Median is applied below to a bag that has already been ordered by the measured value.
DEFINE Median datafu.pig.stats.Median();
-- StreamingQuantile is given three quantile levels: 0.75, 0.90 and 0.95.
DEFINE Quantile datafu.pig.stats.StreamingQuantile('0.75','0.90','0.95');
-- VAR is wrapped in SQRT below to obtain a standard deviation.
DEFINE VAR datafu.pig.stats.VAR();
-- Read the comma-separated click log, then keep only the two columns the rest
-- of the script uses, with time placed first.
clicks_raw = LOAD 'clicks.csv' USING PigStorage(',')
    AS (memberId:int, time:long, url:chararray);
pv = FOREACH clicks_raw GENERATE time, memberId;
-- Per member: sort that member's page views by time and pass them through
-- Sessionize, which appends a sessionId to every row.
pv_by_member = GROUP pv BY memberId;
sessionized_rows = FOREACH pv_by_member {
    by_time = ORDER pv BY time;
    GENERATE FLATTEN(Sessionize(by_time)) AS (time, memberId, sessionId);
};
-- Only the session id and the event time are needed downstream.
pv_sessionized = FOREACH sessionized_rows GENERATE sessionId, time;
-- Length of each session in minutes: the span between its earliest and latest
-- event, divided by 1000 and then by 60 (assumes time is in milliseconds).
events_by_session = GROUP pv_sessionized BY sessionId;
session_times = FOREACH events_by_session GENERATE
    group AS sessionId,
    (MAX(pv_sessionized.time)-MIN(pv_sessionized.time)) / 1000.0 / 60.0 AS session_length;
-- Summary statistics over all session lengths: mean, standard deviation,
-- median and the configured quantiles. The bag is sorted by session_length
-- before being handed to the UDFs.
all_session_times = GROUP session_times ALL;
session_stats = FOREACH all_session_times {
    by_length = ORDER session_times BY session_length;
    GENERATE
        AVG(by_length.session_length)       AS avg_session,
        SQRT(VAR(by_length.session_length)) AS std_dev_session,
        Median(by_length.session_length)    AS median_session,
        Quantile(by_length.session_length)  AS quantiles_session;
};
-- Print the single summary row to the console.
DUMP session_stats;
-- Sample output, in the order avg_session, std_dev_session, (median_session), (quantiles_session):
--(15.737532575757575,31.29552045993877,(2.848041666666667),(14.648516666666666,31.88788333333333,86.69525))