blob: 1d0cd14ec75b4cdcac2652130945216901b95ed5 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Sessionize</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.19.0</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__sessionize.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Sessionize<div class="ingroups"><a class="el" href="group__grp__datatrans.html">Data Types and Transformations</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> <ul>
<li>
<a href="#syntax">Function Syntax</a> </li>
<li>
<a href="#examples">Examples</a> </li>
<li>
<a href="#literature">Literature</a> </li>
</ul>
</div><p>The MADlib sessionize function performs time-oriented session reconstruction on a data set comprising a sequence of events. A defined period of inactivity indicates the end of one session and beginning of the next session. Sessions can be useful in many domains including web analytics [1], network security, manufacturing, finance, and operational analytics.</p>
<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Function Syntax</dt><dd><pre class="syntax">
sessionize(
source_table,
output_table,
partition_expr,
time_stamp,
max_time,
output_cols,
create_view
)
</pre></dd></dl>
<p><b>Arguments</b> </p><dl class="arglist">
<dt>source_table </dt>
<dd><p class="startdd">VARCHAR. Name of the source table that contains the data to be sessionized.</p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">VARCHAR. Name of the output view or table. (The parameter create_view described below defines whether the output is actually a view or a table.) In addition to the columns in the source_table, the output also contains a new column called session_id: </p><ul>
<li>
session_id=1,2,...n where n is the number of the session in the partition. </li>
</ul>
<p class="enddd"></p>
</dd>
<dt>partition_expr </dt>
<dd><p class="startdd">VARCHAR. The 'partition_expr' is a single column or a list of comma-separated columns/expressions to divide all rows into groups, or partitions. Sessionization is applied across the rows that fall into the same partition. This parameter can be set to NULL or '' to indicate the sessionization operation is to be applied to the whole input table.</p>
<p class="enddd"></p>
</dd>
<dt>time_stamp </dt>
<dd><p class="startdd">VARCHAR. The time stamp column name that is used for sessionization calculation. Note that the time_stamp column will be sorted in ascending order before the session reconstruction is done within a partition.</p>
<p class="enddd"></p>
</dd>
<dt>max_time </dt>
<dd><p class="startdd">INTERVAL. Maximum delta time (i.e., time out) between subsequent events that define a session. If the elapsed time between subsequent events is longer than max_time, a new session is created.</p>
<p class="enddd"><a class="anchor" id="note"></a></p><dl class="section note"><dt>Note</dt><dd>Note that max_time is of time type INTERVAL which is a PostgreSQL way of describing elapsed time. For more information on INTERVAL please refer to reference [2].</dd></dl>
</dd>
<dt>output_cols (optional) </dt>
<dd><p class="startdd">VARCHAR. An optional comma separated list of columns to be written to the output_table. Must be a valid SELECT expression. This is set to '*' by default, which means all columns in the input table will be written to the output_table plus the session_id column. Note that this parameter could include a list containing the partition_expr or any other expressions of interest. E.g., '*, expr1, expr2, etc.' where this means output all columns from the input table plus the expressions listed plus the session_id column.</p>
<p class="enddd"></p>
</dd>
<dt>create_view (optional) </dt>
<dd>BOOLEAN default: TRUE. Determines whether to create a view or materialize the output as a table. If you only need session info once, creating a view could be significantly faster than materializing as a table. Please note that if you set create_view to NULL (allowed by PostgreSQL) it will get set to the default value of TRUE. </dd>
</dl>
<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
<p>The data set describes shopper behavior on a notional web site that sells beer and wine. A beacon fires an event to a log file when the shopper visits different pages on the site: landing page, beer selection page, wine selection page, and checkout. Each user is identified by a a user id, and every time a page is visited, the page and time stamp are logged.</p>
<p>Create the data table:</p>
<pre class="example">
DROP TABLE IF EXISTS eventlog CASCADE; -- Using CASCADE in case you are running through this example more than once (views used below)
CREATE TABLE eventlog (event_timestamp TIMESTAMP,
user_id INT,
page TEXT,
revenue FLOAT);
INSERT INTO eventlog VALUES
('04/15/2015 02:19:00', 101331, 'CHECKOUT', 16),
('04/15/2015 02:17:00', 202201, 'WINE', 0),
('04/15/2015 03:18:00', 202201, 'BEER', 0),
('04/15/2015 01:03:00', 100821, 'LANDING', 0),
('04/15/2015 01:04:00', 100821, 'WINE', 0),
('04/15/2015 01:05:00', 100821, 'CHECKOUT', 39),
('04/15/2015 02:06:00', 100821, 'WINE', 0),
('04/15/2015 02:09:00', 100821, 'WINE', 0),
('04/15/2015 02:15:00', 101331, 'LANDING', 0),
('04/15/2015 02:16:00', 101331, 'WINE', 0),
('04/15/2015 02:17:00', 101331, 'HELP', 0),
('04/15/2015 02:18:00', 101331, 'WINE', 0),
('04/15/2015 02:29:00', 201881, 'LANDING', 0),
('04/15/2015 02:30:00', 201881, 'BEER', 0),
('04/15/2015 01:05:00', 202201, 'LANDING', 0),
('04/15/2015 01:06:00', 202201, 'HELP', 0),
('04/15/2015 01:09:00', 202201, 'LANDING', 0),
('04/15/2015 02:15:00', 202201, 'WINE', 0),
('04/15/2015 02:16:00', 202201, 'BEER', 0),
('04/15/2015 03:19:00', 202201, 'WINE', 0),
('04/15/2015 03:22:00', 202201, 'CHECKOUT', 21);
</pre><p>Sessionize the table by each user_id: </p><pre class="example">
DROP VIEW IF EXISTS sessionize_output_view;
SELECT madlib.sessionize(
'eventlog', -- Name of input table
'sessionize_output_view', -- View to store sessionize results
'user_id', -- Partition input table by user id
'event_timestamp', -- Time column used to compute sessions
'0:30:0' -- Use 30 minute time out to define sessions
);
SELECT * FROM sessionize_output_view ORDER BY user_id, event_timestamp;
</pre><p>Result: </p><pre class="result">
event_timestamp | user_id | page | revenue | session_id
---------------------+---------+----------+---------+------------
2015-04-15 01:03:00 | 100821 | LANDING | 0 | 1
2015-04-15 01:04:00 | 100821 | WINE | 0 | 1
2015-04-15 01:05:00 | 100821 | CHECKOUT | 39 | 1
2015-04-15 02:06:00 | 100821 | WINE | 0 | 2
2015-04-15 02:09:00 | 100821 | WINE | 0 | 2
2015-04-15 02:15:00 | 101331 | LANDING | 0 | 1
2015-04-15 02:16:00 | 101331 | WINE | 0 | 1
2015-04-15 02:17:00 | 101331 | HELP | 0 | 1
2015-04-15 02:18:00 | 101331 | WINE | 0 | 1
2015-04-15 02:19:00 | 101331 | CHECKOUT | 16 | 1
2015-04-15 02:29:00 | 201881 | LANDING | 0 | 1
2015-04-15 02:30:00 | 201881 | BEER | 0 | 1
2015-04-15 01:05:00 | 202201 | LANDING | 0 | 1
2015-04-15 01:06:00 | 202201 | HELP | 0 | 1
2015-04-15 01:09:00 | 202201 | LANDING | 0 | 1
2015-04-15 02:15:00 | 202201 | WINE | 0 | 2
2015-04-15 02:16:00 | 202201 | BEER | 0 | 2
2015-04-15 02:17:00 | 202201 | WINE | 0 | 2
2015-04-15 03:18:00 | 202201 | BEER | 0 | 3
2015-04-15 03:19:00 | 202201 | WINE | 0 | 3
2015-04-15 03:22:00 | 202201 | CHECKOUT | 21 | 3
(21 rows)
</pre><p>Now let's say we want to see 3 minute sessions by a group of users with a certain range of user IDs. To do this, we need to sessionize the table based on a partition expression. Also, we want to persist a table output with a reduced set of columns in the table. </p><pre class="example">
DROP TABLE IF EXISTS sessionize_output_table;
SELECT madlib.sessionize(
'eventlog', -- Name of input table
'sessionize_output_table', -- Table to store sessionize results
'user_id &lt; 200000', -- Partition input table by subset of users
'event_timestamp', -- Order partitions in input table by time
'180', -- Use 180 second time out to define sessions (same as '0:03:0')
'event_timestamp, user_id, user_id &lt; 200000 AS "Department-A1"', -- Select only user_id and event_timestamp columns, along with the session id as output
'f' -- create a table
);
SELECT * FROM sessionize_output_table WHERE "Department-A1"='TRUE' ORDER BY event_timestamp;
</pre><p>Result showing 2 users and 3 total sessions across the group: </p><pre class="result">
event_timestamp | user_id | Department-A1 | session_id
---------------------+---------+---------------+------------
2015-04-15 01:03:00 | 100821 | t | 1
2015-04-15 01:04:00 | 100821 | t | 1
2015-04-15 01:05:00 | 100821 | t | 1
2015-04-15 02:06:00 | 100821 | t | 2
2015-04-15 02:09:00 | 100821 | t | 2
2015-04-15 02:15:00 | 101331 | t | 3
2015-04-15 02:16:00 | 101331 | t | 3
2015-04-15 02:17:00 | 101331 | t | 3
2015-04-15 02:18:00 | 101331 | t | 3
2015-04-15 02:19:00 | 101331 | t | 3
(10 rows)
</pre><p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p>NOTE: The following PostgreSQL link refers to documentation resources for the current PostgreSQL database version. Depending upon your database platform version, you may need to change "current" reference in the link to your database version.</p>
<p>If your database platform uses the Greenplum Database (or related variants), please check with the project community and/or your database vendor to identify the PostgreSQL version it is based on.</p>
<p>[1] Sesssions in web analytics <a href="https://en.wikipedia.org/wiki/Session_(web_analytics)">https://en.wikipedia.org/wiki/Session_(web_analytics)</a></p>
<p>[2] PostgreSQL date/time types <a href="https://www.postgresql.org/docs/current/static/datatype-datetime.html">https://www.postgresql.org/docs/current/static/datatype-datetime.html</a> </p>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Wed Dec 15 2021 20:27:19 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>