blob: 735eaf6214fc4d2f7bef3c0d25bc5dc4fa93db0d [file] [log] [blame]
/* ----------------------------------------------------------------------- */
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @file sessionize.sql_in
*
* @brief SQL functions for sessionization functions
* @date May 2016
*
*/
/* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_sessionize
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#syntax">Function Syntax</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Session reconstruction of data
consisting of a time stampled sequence of events.
@details The MADlib sessionize function performs time-oriented session reconstruction
on a data set comprising a sequence of events. A defined period of inactivity
indicates the end of one session and beginning of the next session. Sessions can be
useful in many domains including web analytics [1], network security, manufacturing, finance,
and operational analytics.
@anchor syntax
@par Function Syntax
<pre class="syntax">
sessionize(
source_table,
output_table,
partition_expr,
time_stamp,
max_time,
output_cols,
create_view
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>VARCHAR. Name of the source table that contains the data to be sessionized.</dd>
<dt>output_table</dt>
<dd>VARCHAR. Name of the output view or table. (The parameter create_view described below
defines whether the output is actually a view or a table.) In addition to the columns in the
source_table, the output also contains a new column called session_id:
<ul>
<li>session_id=1,2,...n where n is the number of the session in the partition.</li>
</ul>
</dd>
<dt>partition_expr</dt>
<dd>VARCHAR. The 'partition_expr' is a single column or a list of
comma-separated columns/expressions to divide all rows into groups,
or partitions. Sessionization is applied across the rows that fall into
the same partition. This parameter can be set to NULL or '' to indicate
the sessionization operation is to be applied to the whole input table.</dd>
<dt>time_stamp</dt>
<dd>VARCHAR. The time stamp column name that is used for sessionization calculation.
Note that the time_stamp column will be sorted in ascending order before
the session reconstruction is done within a partition.</dd>
<dt>max_time</dt>
<dd>INTERVAL. Maximum delta time (i.e., time out) between subsequent events that define a session.
If the elapsed time between subsequent events is longer than max_time, a new session is created.</dd>
@anchor note
@note
Note that max_time is of time type INTERVAL which is a PostgreSQL way of describing elapsed
time. For more information on INTERVAL please refer to reference [2].
<dt>output_cols (optional)</dt>
<dd>VARCHAR. An optional comma separated list of columns to be written to the output_table.
Must be a valid SELECT expression. This is set to '*' by default, which means all columns in the
input table will be written to the output_table plus the session_id column.
Note that this parameter could include a list containing the partition_expr
or any other expressions of interest. E.g., '*, expr1, expr2, etc.' where this means
output all columns from the input table plus the expressions listed plus the session_id column.</dd>
<dt>create_view (optional)</dt>
<dd>BOOLEAN default: TRUE. Determines whether to create a view or materialize the output as a table.
If you only need session info once, creating a view could be significantly
faster than materializing as a table. Please note that if you set create_view to NULL
(allowed by PostgreSQL) it will get set to the default value of TRUE.</dd>
</dl>
@anchor examples
@examp
The data set describes shopper behavior on a notional web site that sells
beer and wine. A beacon fires an event to a log file when the shopper
visits different pages on the site: landing page, beer selection page,
wine selection page, and checkout. Each user is identified by a a user id,
and every time a page is visited, the page and time stamp are logged.
Create the data table:
<pre class="example">
DROP TABLE IF EXISTS eventlog CASCADE; -- Using CASCADE in case you are running through this example more than once (views used below)
CREATE TABLE eventlog (event_timestamp TIMESTAMP,
user_id INT,
page TEXT,
revenue FLOAT);
INSERT INTO eventlog VALUES
('04/15/2015 02:19:00', 101331, 'CHECKOUT', 16),
('04/15/2015 02:17:00', 202201, 'WINE', 0),
('04/15/2015 03:18:00', 202201, 'BEER', 0),
('04/15/2015 01:03:00', 100821, 'LANDING', 0),
('04/15/2015 01:04:00', 100821, 'WINE', 0),
('04/15/2015 01:05:00', 100821, 'CHECKOUT', 39),
('04/15/2015 02:06:00', 100821, 'WINE', 0),
('04/15/2015 02:09:00', 100821, 'WINE', 0),
('04/15/2015 02:15:00', 101331, 'LANDING', 0),
('04/15/2015 02:16:00', 101331, 'WINE', 0),
('04/15/2015 02:17:00', 101331, 'HELP', 0),
('04/15/2015 02:18:00', 101331, 'WINE', 0),
('04/15/2015 02:29:00', 201881, 'LANDING', 0),
('04/15/2015 02:30:00', 201881, 'BEER', 0),
('04/15/2015 01:05:00', 202201, 'LANDING', 0),
('04/15/2015 01:06:00', 202201, 'HELP', 0),
('04/15/2015 01:09:00', 202201, 'LANDING', 0),
('04/15/2015 02:15:00', 202201, 'WINE', 0),
('04/15/2015 02:16:00', 202201, 'BEER', 0),
('04/15/2015 03:19:00', 202201, 'WINE', 0),
('04/15/2015 03:22:00', 202201, 'CHECKOUT', 21);
</pre>
Sessionize the table by each user_id:
<pre class="example">
DROP VIEW IF EXISTS sessionize_output_view;
SELECT madlib.sessionize(
'eventlog', -- Name of input table
'sessionize_output_view', -- View to store sessionize results
'user_id', -- Partition input table by user id
'event_timestamp', -- Time column used to compute sessions
'0:30:0' -- Use 30 minute time out to define sessions
);
SELECT * FROM sessionize_output_view ORDER BY user_id, event_timestamp;
</pre>
Result:
<pre class="result">
event_timestamp | user_id | page | revenue | session_id
---------------------+---------+----------+---------+------------
2015-04-15 01:03:00 | 100821 | LANDING | 0 | 1
2015-04-15 01:04:00 | 100821 | WINE | 0 | 1
2015-04-15 01:05:00 | 100821 | CHECKOUT | 39 | 1
2015-04-15 02:06:00 | 100821 | WINE | 0 | 2
2015-04-15 02:09:00 | 100821 | WINE | 0 | 2
2015-04-15 02:15:00 | 101331 | LANDING | 0 | 1
2015-04-15 02:16:00 | 101331 | WINE | 0 | 1
2015-04-15 02:17:00 | 101331 | HELP | 0 | 1
2015-04-15 02:18:00 | 101331 | WINE | 0 | 1
2015-04-15 02:19:00 | 101331 | CHECKOUT | 16 | 1
2015-04-15 02:29:00 | 201881 | LANDING | 0 | 1
2015-04-15 02:30:00 | 201881 | BEER | 0 | 1
2015-04-15 01:05:00 | 202201 | LANDING | 0 | 1
2015-04-15 01:06:00 | 202201 | HELP | 0 | 1
2015-04-15 01:09:00 | 202201 | LANDING | 0 | 1
2015-04-15 02:15:00 | 202201 | WINE | 0 | 2
2015-04-15 02:16:00 | 202201 | BEER | 0 | 2
2015-04-15 02:17:00 | 202201 | WINE | 0 | 2
2015-04-15 03:18:00 | 202201 | BEER | 0 | 3
2015-04-15 03:19:00 | 202201 | WINE | 0 | 3
2015-04-15 03:22:00 | 202201 | CHECKOUT | 21 | 3
(21 rows)
</pre>
Now let's say we want to see 3 minute sessions by a group of users
with a certain range of user IDs. To do this, we need to sessionize
the table based on a partition expression. Also, we want to persist
a table output with a reduced set of columns in the table.
<pre class="example">
DROP TABLE IF EXISTS sessionize_output_table;
SELECT madlib.sessionize(
'eventlog', -- Name of input table
'sessionize_output_table', -- Table to store sessionize results
'user_id < 200000', -- Partition input table by subset of users
'event_timestamp', -- Order partitions in input table by time
'180', -- Use 180 second time out to define sessions (same as '0:03:0')
'event_timestamp, user_id, user_id < 200000 AS "Department-A1"', -- Select only user_id and event_timestamp columns, along with the session id as output
'f' -- create a table
);
SELECT * FROM sessionize_output_table WHERE "Department-A1"='TRUE' ORDER BY event_timestamp;
</pre>
Result showing 2 users and 3 total sessions across the group:
<pre class="result">
event_timestamp | user_id | Department-A1 | session_id
---------------------+---------+---------------+------------
2015-04-15 01:03:00 | 100821 | t | 1
2015-04-15 01:04:00 | 100821 | t | 1
2015-04-15 01:05:00 | 100821 | t | 1
2015-04-15 02:06:00 | 100821 | t | 2
2015-04-15 02:09:00 | 100821 | t | 2
2015-04-15 02:15:00 | 101331 | t | 3
2015-04-15 02:16:00 | 101331 | t | 3
2015-04-15 02:17:00 | 101331 | t | 3
2015-04-15 02:18:00 | 101331 | t | 3
2015-04-15 02:19:00 | 101331 | t | 3
(10 rows)
</pre>
@anchor literature
@par Literature
NOTE: The following PostgreSQL link refers to documentation resources
for the current PostgreSQL database version. Depending upon your
database platform version, you may need to change "current" reference
in the link to your database version.
If your database platform uses the Greenplum Database (or related
variants), please check with the project community and/or your
database vendor to identify the PostgreSQL version it is based on.
[1] Sesssions in web analytics
https://en.wikipedia.org/wiki/Session_(web_analytics)
[2] PostgreSQL date/time types
https://www.postgresql.org/docs/current/static/datatype-datetime.html
*/
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize(
source_table VARCHAR,
output_table VARCHAR,
partition_expr VARCHAR,
time_stamp VARCHAR,
max_time INTERVAL,
output_cols VARCHAR,
create_view BOOLEAN
) RETURNS void AS $$
PythonFunction(utilities, sessionize, sessionize)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize(
source_table VARCHAR,
output_table VARCHAR,
partition_expr VARCHAR,
time_stamp VARCHAR,
max_time INTERVAL,
output_cols VARCHAR
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.sessionize($1, $2, $3, $4, $5, $6, True);
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize(
source_table VARCHAR,
output_table VARCHAR,
partition_expr VARCHAR,
time_stamp VARCHAR,
max_time INTERVAL
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.sessionize($1, $2, $3, $4, $5, '*', True);
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize(message TEXT)
RETURNS text AS $$
PythonFunction(utilities, sessionize, sessionize_help_message)
$$ language plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize()
RETURNS text AS $$
SELECT MADLIB_SCHEMA.sessionize('');
$$ language SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');