blob: dee1dbbac4236a931edf7cebaf4102c68580fffc [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.10"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Grp_path</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
$(window).load(resizeHeight);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script src="../mathjax/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="navtree_hack.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'auto');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.8</span>
</div>
<div id="projectbrief">User Documentation for MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.10 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__path.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Grp_path</div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> </p><ul>
<li>
<a href="#syntax">Function Syntax</a> </li>
<li>
<a href="#examples">Examples</a> </li>
</ul>
</div><p>The goal of the MADlib path function is to perform regular pattern matching over a sequence of rows, and to extract useful information about the matches. The useful information could be a simple count of matches or something more involved like aggregation.</p>
<p>There are many use cases where path functions are typically used:</p><ul>
<li>Web analytics (clickstream)</li>
<li>Marketing revenue attribution</li>
<li>Telephone calling patterns</li>
<li>Stock market trading sequences</li>
<li>Predictive maintenance</li>
<li>Genomics sequencing</li>
</ul>
<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Function Syntax</dt><dd><pre class="syntax">
path(
source_table,
output_table,
partition_expr,
order_expr,
pattern,
symbol,
aggregate_func,
persist_rows
)
</pre></dd></dl>
<p><b>Arguments</b> </p><dl class="arglist">
<dt>source_table </dt>
<dd><p class="startdd">VARCHAR. Name of the source table, containing data for path analysis.</p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">VARCHAR. Name of the result table. This table contains columns predicated by the <em>result</em> argument (shown below).</p>
<p class="enddd"></p>
</dd>
<dt>partition_expr </dt>
<dd><p class="startdd">VARCHAR. The 'partition_expr' can be a list of columns or expressions (separated by comma) to divide all rows into groups, or partitions, that share the same values of the partition expression(s). For each row, the matching is applied across the rows that fall into the same partition. This can be NULL or '' to indicate the matching is to be applied on the whole table.</p>
<p class="enddd"></p>
</dd>
<dt>order_expr </dt>
<dd><p class="startdd">VARCHAR. This expression controls the order in which rows are processed or matched in a partition. </p>
<p class="enddd"></p>
</dd>
<dt>pattern </dt>
<dd><p class="startdd">VARCHAR. The PATTERN clause defines the pattern that path searches for. You express the PATTERN using symbols and operators. Symbols defined using more than 1 characters need to be wrapped in parentheses '()'. The following pattern-matching metacharacters are supported: </p><ul>
<li>
| denotes alternation (either of two alternatives). </li>
<li>
? denotes repetition of the previous item zero or one time. </li>
<li>
* denotes repetition of the previous item zero or more times. </li>
<li>
+ denotes repetition of the previous item one or more times. </li>
<li>
{m} denotes repetition of the previous item exactly m times. </li>
<li>
{m,} denotes repetition of the previous item m or more times. </li>
<li>
{m,n} denotes repetition of the previous item at least m and not more than n times. </li>
<li>
Parentheses () can be used to group items into a single logical item. </li>
</ul>
<p class="enddd"></p>
</dd>
<dt>symbol </dt>
<dd><p class="startdd">VARCHAR. A symbol represents a row of a particular type that you’re searching for as part of a row sequence. In the SYMBOLS clause, you write a predicate to define the type of row that matches the symbol. </p>
<p class="enddd"></p>
</dd>
<dt>aggregate_func </dt>
<dd><p class="startdd">VARCHAR. A comma-separated list of window functions and aggregates to be applied on the matched window. </p>
<p class="enddd"></p>
</dd>
<dt>persist_rows </dt>
<dd><p class="startdd">BOOLEAN. If TRUE the matched rows are persisted in another table. This table is named as &lt;output_table&gt;_tuples (the string "_tuples" is added as suffix to the value of <em>output_table</em>). </p>
<p class="enddd"></p>
</dd>
</dl>
<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
<ul>
<li>Build sample dataset <pre class="example">
CREATE TABLE data (id integer, sessionid integer, starttime timestamp, topic varchar, portfolio integer);
COPY data FROM STDIN DELIMITER AS '|';
1|1|1/01/00 5:00 AM|Real-time Equity pricing|769
1|2|1/01/00 5:30 AM|Real-time Index pricing|9898
1|3|1/01/00 6:00 AM|Real-time Index pricing|9898
1|4|1/01/00 5:00 AM|Calendar|98977
1|5|1/01/00 10:00 AM|Real-time Equity pricing|769
1|6|1/01/00 10:30 AM|Real-time Equity pricing|9898
1|7|1/01/00 12:00 PM|Calendar|1325
1|8|1/01/00 12:00 PM|Calendar|6777
1|9|1/01/00 4:05 PM|Real-time Equity pricing|769
1|10|1/01/00 4:10 PM|Report Dates and Analyst Recommendations|34346
1|11|1/01/00 4:15 PM|Real-time Index pricing|9898
1|12|1/01/00 4:20 PM|Snapshot Futures pricing|5568
1|13|1/01/00 4:21 PM|Real-time Equity pricing|769
1|14|1/01/00 4:25 PM|Bond security master|789
1|15|1/01/00 10:00 PM|Bond pricing|55
1|16|1/01/00 10:00 PM|Bond pricing|55
1|17|1/01/00 10:00 PM|Bond pricing|55
1|18|1/01/00 10:00 PM|Bond Yield Analysis|655
1|19|1/01/00 10:00 PM|Bond pricing|55
1|20|1/01/00 10:00 PM|Bond pricing|55
2|1|1/04/00 12:00 AM|Real-time Equity pricing|769
2|2|1/04/00 12:00 AM|Real-time Equity pricing|234234
2|3|1/04/00 12:00 AM|Calendar|1325
2|4|1/04/00 12:00 AM|Calendar|6777
2|5|1/04/00 12:00 AM|Real-time Equity pricing|769
2|6|1/04/00 12:00 AM|Report Dates and Analyst Recommendations|34346
2|7|1/04/00 12:00 AM|Real-time Index pricing|9898
2|8|1/04/00 12:00 AM|Snapshot Futures pricing|5568
2|9|1/04/00 12:00 AM|Real-time Equity pricing|769
2|10|1/04/00 12:00 AM|Bond security master|789
2|11|1/04/00 12:00 AM|Bond pricing|55
2|12|1/04/00 12:00 AM|Bond pricing|55
2|13|1/04/00 12:00 AM|Bond pricing|55
2|14|1/04/00 12:00 AM|Bond pricing|55
\.
&#160;
CREATE TABLE trades AS
SELECT *, starttimestamp::date startdate,
CASE WHEN THEN 'before'
WHEN THEN 'market'
WHEN THEN 'close'
WHEN THEN 'after'
END tradingperiod
FROM sessiontable
</pre></li>
<li>Compare within each day for the same Topic/Portfolio across every user <pre class="example">
SELECT madlib.path(
'trades',
'trades_out',
'startdate, topic, portfolio' -- each day of activity is looked at independently
'starttime' -- order by time
'BEFORE*.MARKET+.CLOSE+.AFTER*' -- at least one event during each of MARKET and CLOSE, but gather up the rest
'BEFORE:=starttimestamp::time &gt;= ''0:00:00'' and starttimestamp::time &lt; ''9:30:00''::time,
MARKET:=starttimestamp::time &gt;= ''9:30:00'' and starttimestamp::time &lt; ''16:00:00''::time,
CLOSE:= starttimestamp::time &lt;= ''16:00:00'' and starttimestamp::time &lt; ''16:30:00''::time,
AFTER:= starttimestamp::time &lt;= ''16:30:00'' and starttimestamp::time &lt; ''24:00:00''::time
',
'first(startdate) as starttime, array_agg(id) as all_users, count(*) as num_matches'
)
</pre> </li>
</ul>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Tue Mar 22 2016 17:55:59 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li>
</ul>
</div>
</body>
</html>