blob: a95a86319fcb0f11696729227262c6c08a2e14b7 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Vector to Columns</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.21.0</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__vec2cols.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Vector to Columns<div class="ingroups"><a class="el" href="group__grp__other__functions.html">Utilities</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> <ul>
<li class="level1">
<a href="#vec2cols_syntax">Syntax</a> </li>
<li class="level1">
<a href="#vec2cols_usage">Usage</a> </li>
<li class="level1">
<a href="#vec2cols_example">Examples</a> </li>
</ul>
</div><dl class="section user"><dt>About</dt><dd>Converts a feature array in a single column into multiple columns. This process can be used to reverse the function cols2vec.</dd></dl>
<p>Given a table with a column of type array, this function will create an output table that splits this array into multiple columns, one per array element. It includes the option to name the new feature columns, and to include columns from the original table in the output.</p>
<p><a class="anchor" id="vec2cols_usage"></a></p><dl class="section user"><dt>Usage</dt><dd></dd></dl>
<pre class="syntax">
vec2cols(
source_table,
output_table,
vector_col,
feature_names,
cols_to_output
)
</pre><p><b>Arguments</b> </p><dl class="arglist">
<dt>source_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing the source data..</p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">TEXT. Name of the generated table containing the output. If a table with the same name already exists, an error will be returned.</p>
<p class="enddd"></p>
</dd>
<dt>vector_col </dt>
<dd><p class="startdd">TEXT. Name of the column containing the feature array. Must be a one-dimensional array.</p>
<p class="enddd"></p>
</dd>
<dt>feature_names (optional) </dt>
<dd>TEXT[]. Array of names associated with the feature array. Note that this array exists in the summary table created by the function 'cols2vec'. If the 'feature_names' array is not specified, column names will be automatically generated of the form 'f1, f2, ...fn'. <dl class="section note"><dt>Note</dt><dd>If you specify the 'feature_names' parameter, you will get exactly that number of feature columns in the 'output_table'. It means feature arrays from the 'vector_col' may be padded or truncated, if a particular feature array size does not match the target number of feature columns. <br />
<br />
If you do not specify the 'feature names' parameter, the number of feature columns generated in the 'output_table' will be the maximum array size from 'vector_col'. Feature arrays that are less than this maximum will be padded.</dd></dl>
</dd>
<dt>cols_to_output (optional) </dt>
<dd>TEXT, default NULL. Comma-separated string of column names from the source table to keep in the output table, in addition to the feature columns. To keep all columns from the source table, use '*'. The total number of columns in a table cannot exceed the PostgreSQL limits. </dd>
</dl>
<p><b>Output table</b> <br />
The output table produced by the vec2cols function contains the following columns: </p><table class="output">
<tr>
<th>&lt;...&gt; </th><td>Columns from source table, depending on which ones are kept (if any). </td></tr>
<tr>
<th>feature columns </th><td>Columns for each of the features in 'vector_col'. Column type will depend on the feature array type in the source table. Column naming will depend on whether the parameter 'feature_names' is used. </td></tr>
</table>
<p><a class="anchor" id="vec2cols_example"></a></p><dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Load sample data: <pre class="example">
DROP TABLE IF EXISTS golf CASCADE;
CREATE TABLE golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Temp_Humidity" double precision[],
clouds_airquality text[],
windy boolean,
class text,
observation_weight double precision
);
INSERT INTO golf VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);
</pre></li>
<li>Split the column "clouds_airquality" into new columns called "clouds" and "air_quality". Also keep columns id and "OUTLOOK" from the source table <pre class="example">
DROP TABLE IF EXISTS vec2cols_result;
SELECT madlib.vec2cols(
'golf', -- source table
'vec2cols_result', -- output table
'clouds_airquality', -- column with array entries to split
ARRAY['clouds', 'air_quality'], -- feature names
'id, "OUTLOOK"' -- columns to keep from source table
);
SELECT * FROM vec2cols_result ORDER BY id;
</pre> <pre class="result">
id | OUTLOOK | clouds | air_quality
----+----------+--------+-------------
1 | sunny | none | unhealthy
2 | sunny | none | moderate
3 | overcast | low | moderate
4 | rain | low | moderate
5 | rain | medium | good
6 | rain | low | unhealthy
7 | overcast | medium | moderate
8 | sunny | high | unhealthy
9 | sunny | high | good
10 | rain | medium | good
11 | sunny | none | good
12 | overcast | medium | moderate
13 | overcast | medium | moderate
14 | rain | low | unhealthy
(14 rows)
</pre></li>
<li>Similar to the previous example, except now we keep all columns from source table and do not specify the feature names, so that default names are created. <pre class="example">
DROP TABLE IF EXISTS vec2cols_result;
SELECT madlib.vec2cols(
'golf', -- source table
'vec2cols_result', -- output table
'clouds_airquality', -- column with array entries to split
NULL, -- feature names
'*' -- columns to keep from source table
);
SELECT * FROM vec2cols_result ORDER BY id;
</pre> <pre class="result">
id | OUTLOOK | temperature | humidity | Temp_Humidity | clouds_airquality | windy | class | observation_weight | f1 | f2
----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+--------+-----------
1 | sunny | 85 | 85 | {85,85} | {none,unhealthy} | f | Don't Play | 5 | none | unhealthy
2 | sunny | 80 | 90 | {80,90} | {none,moderate} | t | Don't Play | 5 | none | moderate
3 | overcast | 83 | 78 | {83,78} | {low,moderate} | f | Play | 1.5 | low | moderate
4 | rain | 70 | 96 | {70,96} | {low,moderate} | f | Play | 1 | low | moderate
5 | rain | 68 | 80 | {68,80} | {medium,good} | f | Play | 1 | medium | good
6 | rain | 65 | 70 | {65,70} | {low,unhealthy} | t | Don't Play | 1 | low | unhealthy
7 | overcast | 64 | 65 | {64,65} | {medium,moderate} | t | Play | 1.5 | medium | moderate
8 | sunny | 72 | 95 | {72,95} | {high,unhealthy} | f | Don't Play | 5 | high | unhealthy
9 | sunny | 69 | 70 | {69,70} | {high,good} | f | Play | 5 | high | good
10 | rain | 75 | 80 | {75,80} | {medium,good} | f | Play | 1 | medium | good
11 | sunny | 75 | 70 | {75,70} | {none,good} | t | Play | 5 | none | good
12 | overcast | 72 | 90 | {72,90} | {medium,moderate} | t | Play | 1.5 | medium | moderate
13 | overcast | 81 | 75 | {81,75} | {medium,moderate} | f | Play | 1.5 | medium | moderate
14 | rain | 71 | 80 | {71,80} | {low,unhealthy} | t | Don't Play | 1 | low | unhealthy
(14 rows)
</pre></li>
<li>Now let's run cols2vec then reverse it using vec2cols. In this case we will get feature names from the cols2vec summary table. First run cols2vec: <pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'temperature, humidity',
NULL,
'id, temperature, humidity'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre> <pre class="result">
id | temperature | humidity | feature_vector
----+-------------+----------+----------------
1 | 85 | 85 | {85,85}
2 | 80 | 90 | {80,90}
3 | 83 | 78 | {83,78}
4 | 70 | 96 | {70,96}
5 | 68 | 80 | {68,80}
6 | 65 | 70 | {65,70}
7 | 64 | 65 | {64,65}
8 | 72 | 95 | {72,95}
9 | 69 | 70 | {69,70}
10 | 75 | 80 | {75,80}
11 | 75 | 70 | {75,70}
12 | 72 | 90 | {72,90}
13 | 81 | 75 | {81,75}
14 | 71 | 80 | {71,80}
(14 rows)
</pre> View the summary table with the feature_names dictionary: <pre class="example">
\x on
SELECT * FROM cols2vec_result_summary;
\x off
</pre> <pre class="result">
-[ RECORD 1 ]---------------+-----------------------
source_table | golf
list_of_features | temperature, humidity
list_of_features_to_exclude | None
feature_names | {temperature,humidity}
</pre> Now use feature_names from the summary table above to name the columns of the split array: <pre class="example">
DROP TABLE IF EXISTS vec2cols_result;
SELECT madlib.vec2cols(
'cols2vec_result', -- source table containing the feature vector
'vec2cols_result', -- output table
'feature_vector', -- column with array entries to split
(SELECT feature_names from cols2vec_result_summary), -- feature_names from summary table of cols2vec
'id' -- columns to keep from source table
);
SELECT * FROM vec2cols_result ORDER BY id;
</pre> <pre class="result">
id | temperature | humidity
----+-------------+----------
1 | 85 | 85
2 | 80 | 90
3 | 83 | 78
4 | 70 | 96
5 | 68 | 80
6 | 65 | 70
7 | 64 | 65
8 | 72 | 95
9 | 69 | 70
10 | 75 | 80
11 | 75 | 70
12 | 72 | 90
13 | 81 | 75
14 | 71 | 80
(14 rows)
</pre> This is the same as the format of the original 'golf' dataset that we started with. </li>
</ol>
</dd></dl>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Thu Feb 23 2023 19:26:43 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>