blob: 1597ebd84c2cb6bbafbac90ffae255126a9a76c2 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Columns to Vector</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.21.0</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__cols2vec.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Columns to Vector<div class="ingroups"><a class="el" href="group__grp__other__functions.html">Utilities</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> <ul>
<li class="level1">
<a href="#cols2vec_syntax">Syntax</a> </li>
<li class="level1">
<a href="#cols2vec_usage">Usage</a> </li>
<li class="level1">
<a href="#cols2vec_example">Examples</a> </li>
</ul>
</div><dl class="section user"><dt>About</dt><dd>Convert feature columns in a table into an array in a single column.</dd></dl>
<p>Given a table with a number of feature columns, this function will create an output table that contains the feature columns in an array. A summary table will also be created that contains the names of the features combined into array, so that this process can be reversed using the function vec2cols.</p>
<p><a class="anchor" id="cols2vec_usage"></a></p><dl class="section user"><dt>Usage</dt><dd></dd></dl>
<pre class="syntax">
cols2vec(
source_table,
output_table,
list_of_features,
list_of_features_to_exclude,
cols_to_output
)
</pre><p><b>Arguments</b> </p><dl class="arglist">
<dt>source_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing the source data.</p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">TEXT. Name of the generated table containing the output.</p>
<p class="enddd"></p>
</dd>
<dt>list_of_features </dt>
<dd><p class="startdd">TEXT. Comma-separated string of column names or expressions to put into feature array. Can also be '*' implying all columns are to be put into feature array (except for the ones included in the next argument that lists exclusions). Type casting will be done as per the regular type casting rules of the underlying database. Array columns in the source table are not supported in the 'list_of_features' parameter. Also, all of the features to be included must be of the same type and must not have null values.</p>
<p class="enddd"></p>
</dd>
<dt>list_of_features_to_exclude (optional) </dt>
<dd><p class="startdd">TEXT. Default NULL. Comma-separated string of column names to exclude from the feature array. Typically used when 'list_of_features' is set to '*'.</p>
<p class="enddd"></p>
</dd>
<dt>cols_to_output (optional) </dt>
<dd>TEXT. Default NULL. Comma-separated string of column names from the source table to keep in the output table, in addition to the feature array. To keep all columns from the source table, use '*' for this parameter. </dd>
</dl>
<p><b>Output table</b> <br />
The output table produced by the cols2vec function contains the following columns: </p><table class="output">
<tr>
<th>&lt;...&gt; </th><td>Columns from source table, depending on which ones are kept (if any). </td></tr>
<tr>
<th>feature_vector </th><td>Column that contains the feature array. </td></tr>
</table>
<p><b>Output summary table</b> <br />
A summary table named <em>&lt;output_table&gt;_summary</em> is also created that contains: </p><table class="output">
<tr>
<th>source_table </th><td>Name of the table containing the source data. </td></tr>
<tr>
<th>list_of_features </th><td>List of features to put in vector. </td></tr>
<tr>
<th>list_of_features_to_exclude </th><td>Features specified by the user to exclude from 'list_of_features'. </td></tr>
<tr>
<th>feature_names </th><td>Names of the features that were nested (converted to a vector) in the output table. </td></tr>
</table>
<p><a class="anchor" id="cols2vec_example"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
<ol type="1">
<li>Load sample data: <pre class="example">
DROP TABLE IF EXISTS golf CASCADE;
CREATE TABLE golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Temp_Humidity" double precision[],
clouds_airquality text[],
windy boolean,
class text,
observation_weight double precision
);
INSERT INTO golf VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);
</pre></li>
<li>Run cols2vec to combine the temperature and humidity columns into a single array feature. <pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'temperature, humidity'
);
SELECT * FROM cols2vec_result;
</pre> <pre class="result">
feature_vector
----------------+
{85,85}
{80,90}
{83,78}
{70,96}
{68,80}
{65,70}
{64,65}
{72,95}
{69,70}
{75,80}
{75,70}
{72,90}
{81,75}
{71,80}
(14 rows)
</pre> View the summary table: <pre class="example">
\x on
SELECT * FROM cols2vec_result_summary;
\x off
</pre> <pre class="result">
-[ RECORD 1 ]---------------+----------------------------------------------------------------
source_table | golf
list_of_features | temperature, humidity
list_of_features_to_exclude | None
feature_names | {temperature,humidity}
</pre></li>
<li>Combine the temperature and humidity columns and keep 2 other columns from source_table. <pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'temperature, humidity',
NULL,
'id, "OUTLOOK"'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre> <pre class="result">
id | OUTLOOK | feature_vector
----+----------+----------------
1 | sunny | {85,85}
2 | sunny | {80,90}
3 | overcast | {83,78}
4 | rain | {70,96}
5 | rain | {68,80}
6 | rain | {65,70}
7 | overcast | {64,65}
8 | sunny | {72,95}
9 | sunny | {69,70}
10 | rain | {75,80}
11 | sunny | {75,70}
12 | overcast | {72,90}
13 | overcast | {81,75}
14 | rain | {71,80}
(14 rows)
</pre> View the summary table: <pre class="example">
\x on
SELECT * FROM cols2vec_result_summary;
\x off
</pre> <pre class="result">
-[ RECORD 1 ]---------------+----------------------------------------------------------------
source_table | golf
list_of_features | temperature, humidity
list_of_features_to_exclude | None
feature_names | {temperature,humidity}
</pre></li>
<li>Combine all columns, excluding all columns that are not of type double precision. <pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'*',
'"OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id',
'id, "OUTLOOK"'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre> <pre class="result">
id | OUTLOOK | feature_vector
----+----------+----------------
1 | sunny | {85,85,5}
2 | sunny | {80,90,5}
3 | overcast | {83,78,1.5}
4 | rain | {70,96,1}
5 | rain | {68,80,1}
6 | rain | {65,70,1}
7 | overcast | {64,65,1.5}
8 | sunny | {72,95,5}
9 | sunny | {69,70,5}
10 | rain | {75,80,1}
11 | sunny | {75,70,5}
12 | overcast | {72,90,1.5}
13 | overcast | {81,75,1.5}
14 | rain | {71,80,1}
(14 rows)
</pre> View summary table: <pre class="example">
\x on
SELECT * FROM cols2vec_result_summary;
\x off
</pre> <pre class="result">
-[ RECORD 1 ]---------------+----------------------------------------------------------------
source_table | golf
list_of_features | *
list_of_features_to_exclude | "OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id
feature_names | {temperature,humidity,observation_weight}
</pre></li>
<li>Combine the temperature and humidity columns, exclude windy, and keep all of the columns from the source table. <pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'windy, temperature, humidity',
'windy',
'*'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre> <pre class="result">
id | OUTLOOK | temperature | humidity | Temp_Humidity | clouds_airquality | windy | class | observation_weight | feature_vector
----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+----------------
1 | sunny | 85 | 85 | {85,85} | {none,unhealthy} | f | Don't Play | 5 | {85,85}
2 | sunny | 80 | 90 | {80,90} | {none,moderate} | t | Don't Play | 5 | {80,90}
3 | overcast | 83 | 78 | {83,78} | {low,moderate} | f | Play | 1.5 | {83,78}
4 | rain | 70 | 96 | {70,96} | {low,moderate} | f | Play | 1 | {70,96}
5 | rain | 68 | 80 | {68,80} | {medium,good} | f | Play | 1 | {68,80}
6 | rain | 65 | 70 | {65,70} | {low,unhealthy} | t | Don't Play | 1 | {65,70}
7 | overcast | 64 | 65 | {64,65} | {medium,moderate} | t | Play | 1.5 | {64,65}
8 | sunny | 72 | 95 | {72,95} | {high,unhealthy} | f | Don't Play | 5 | {72,95}
9 | sunny | 69 | 70 | {69,70} | {high,good} | f | Play | 5 | {69,70}
10 | rain | 75 | 80 | {75,80} | {medium,good} | f | Play | 1 | {75,80}
11 | sunny | 75 | 70 | {75,70} | {none,good} | t | Play | 5 | {75,70}
12 | overcast | 72 | 90 | {72,90} | {medium,moderate} | t | Play | 1.5 | {72,90}
13 | overcast | 81 | 75 | {81,75} | {medium,moderate} | f | Play | 1.5 | {81,75}
14 | rain | 71 | 80 | {71,80} | {low,unhealthy} | t | Don't Play | 1 | {71,80}
(14 rows)
</pre> View the summary table: <pre class="example">
\x on
SELECT * FROM cols2vec_result_summary;
\x off
</pre> <pre class="result">
-[ RECORD 1 ]---------------+-----------------------------
source_table | golf
list_of_features | windy, temperature, humidity
list_of_features_to_exclude | windy
feature_names | {temperature,humidity}
</pre> This also shows that you can exclude features in 'list_of_features_to_exclude' that are in the list of 'list_of_features'. This can be useful if the 'list_of_features' is generated from an expression or subquery.</li>
<li>Type casting works as per regular rules of the underlying database. E.g, combining integer and double precisions columns will create a double precision feature vector. For Boolean, do an explicit cast to the target type: <pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'windy::TEXT, class',
NULL,
'id'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre> <pre class="result">
id | feature_vector
-&mdash;+-------------------&mdash;
1 | {false,"Don't Play"}
2 | {true,"Don't Play"}
3 | {false,Play}
4 | {false,Play}
5 | {false,Play}
6 | {true,"Don't Play"}
7 | {true,Play}
8 | {false,"Don't Play"}
9 | {false,Play}
10 | {false,Play}
11 | {true,Play}
12 | {true,Play}
13 | {false,Play}
14 | {true,"Don't Play"}
(14 rows)
</pre> </li>
</ol>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Thu Feb 23 2023 19:26:43 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>