blob: 3fdbe704b1814df9a9646c28303e4c8d35ba5309 [file] [log] [blame]
/* ----------------------------------------------------------------------- */
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file cols2vec.sql_in
* @brief A set of utilities to ease basic table transformations
* (such as *aggregating multiple columns in an array)
* @date July 2018
*
*/
/* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_cols2vec
@brief Create a new table with all feature columns inserted into a single column as an array
<div class="toc"><b>Contents</b>
<ul>
<li class="level1"><a href="#cols2vec_syntax">Syntax</a>
<li class="level1"><a href="#cols2vec_usage">Usage</a>
<li class="level1"><a href="#cols2vec_example">Examples</a>
</ul>
</div>
@about
Convert feature columns in a table into an array in a single column.
Given a table with a number of feature columns, this function will create an
output table that contains the feature columns in an array. A summary
table will also be created that contains the names of the features combined into
array, so that this process can be reversed using the function vec2cols.
@anchor cols2vec_usage
@usage
<pre class="syntax">
cols2vec(
source_table,
output_table,
list_of_features,
list_of_features_to_exclude,
cols_to_output
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>TEXT. Name of the table containing the source data.</dd>
<dt>output_table</dt>
<dd>TEXT. Name of the generated table containing the output.</dd>
<dt>list_of_features</dt>
<dd>TEXT.
Comma-separated string of column names or expressions to put into feature array.
Can also be '*' implying all columns are to be put into feature array (except
for the ones included in the next argument that lists exclusions). Type casting
will be done as per the regular type casting rules of the underlying database.
Array columns in the source table are not supported in the 'list_of_features' parameter. Also,
all of the features to be included must be of the same type and must not have null
values.</dd>
<dt>list_of_features_to_exclude (optional)</dt>
<dd>TEXT. Default NULL.
Comma-separated string of column names to exclude from the feature array. Typically used
when 'list_of_features' is set to '*'.</dd>
<dt>cols_to_output (optional)</dt>
<dd>TEXT. Default NULL.
Comma-separated string of column names from the source table to keep in the
output table, in addition to the feature array. To keep all columns from the
source table, use '*' for this parameter. </dd>
</dl>
<b>Output table</b>
<br>
The output table produced by the cols2vec function contains the following columns:
<table class="output">
<tr>
<th><...></th>
<td>Columns from source table, depending on which ones are kept (if any).
</td>
</tr>
<tr>
<th>feature_vector</th>
<td>Column that contains the feature array.
</tr>
</table>
<b>Output summary table</b>
<br>
A summary table named <em><output_table>_summary</em> is also created
that contains:
<table class="output">
<tr>
<th>source_table</th>
<td>Name of the table containing the source data.</td>
</tr>
<tr>
<th>list_of_features</th>
<td>List of features to put in vector.</td>
</tr>
<tr>
<th>list_of_features_to_exclude</th>
<td>Features specified by the user to exclude from 'list_of_features'.</td>
</tr>
<tr>
<th>feature_names</th>
<td>Names of the features that were nested (converted to a vector) in the output table.</td>
</tr>
</table>
@anchor cols2vec_example
@par Examples
-# Load sample data:
<pre class="example">
DROP TABLE IF EXISTS golf CASCADE;
CREATE TABLE golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Temp_Humidity" double precision[],
clouds_airquality text[],
windy boolean,
class text,
observation_weight double precision
);
INSERT INTO golf VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);
</pre>
-# Run cols2vec to combine the temperature and humidity columns into a single array feature.
<pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'temperature, humidity'
);
SELECT * FROM cols2vec_result;
</pre>
<pre class="result">
feature_vector
----------------+
{85,85}
{80,90}
{83,78}
{70,96}
{68,80}
{65,70}
{64,65}
{72,95}
{69,70}
{75,80}
{75,70}
{72,90}
{81,75}
{71,80}
(14 rows)
</pre>
View the summary table:
<pre class="example">
\\x on
SELECT * FROM cols2vec_result_summary;
\\x off
</pre>
<pre class="result">
-[ RECORD 1 ]---------------+----------------------------------------------------------------
source_table | golf
list_of_features | temperature, humidity
list_of_features_to_exclude | None
feature_names | {temperature,humidity}
</pre>
-# Combine the temperature and humidity columns
and keep 2 other columns from source_table.
<pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'temperature, humidity',
NULL,
'id, "OUTLOOK"'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre>
<pre class="result">
id | OUTLOOK | feature_vector
----+----------+----------------
1 | sunny | {85,85}
2 | sunny | {80,90}
3 | overcast | {83,78}
4 | rain | {70,96}
5 | rain | {68,80}
6 | rain | {65,70}
7 | overcast | {64,65}
8 | sunny | {72,95}
9 | sunny | {69,70}
10 | rain | {75,80}
11 | sunny | {75,70}
12 | overcast | {72,90}
13 | overcast | {81,75}
14 | rain | {71,80}
(14 rows)
</pre>
View the summary table:
<pre class="example">
\\x on
SELECT * FROM cols2vec_result_summary;
\\x off
</pre>
<pre class="result">
-[ RECORD 1 ]---------------+----------------------------------------------------------------
source_table | golf
list_of_features | temperature, humidity
list_of_features_to_exclude | None
feature_names | {temperature,humidity}
</pre>
-# Combine all columns, excluding all columns that are not of type double precision.
<pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'*',
'"OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id',
'id, "OUTLOOK"'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre>
<pre class="result">
id | OUTLOOK | feature_vector
----+----------+----------------
1 | sunny | {85,85,5}
2 | sunny | {80,90,5}
3 | overcast | {83,78,1.5}
4 | rain | {70,96,1}
5 | rain | {68,80,1}
6 | rain | {65,70,1}
7 | overcast | {64,65,1.5}
8 | sunny | {72,95,5}
9 | sunny | {69,70,5}
10 | rain | {75,80,1}
11 | sunny | {75,70,5}
12 | overcast | {72,90,1.5}
13 | overcast | {81,75,1.5}
14 | rain | {71,80,1}
(14 rows)
</pre>
View summary table:
<pre class="example">
\\x on
SELECT * FROM cols2vec_result_summary;
\\x off
</pre>
<pre class="result">
-[ RECORD 1 ]---------------+----------------------------------------------------------------
source_table | golf
list_of_features | *
list_of_features_to_exclude | "OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id
feature_names | {temperature,humidity,observation_weight}
</pre>
-# Combine the temperature and humidity columns, exclude windy, and keep all of the
columns from the source table.
<pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'windy, temperature, humidity',
'windy',
'*'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre>
<pre class="result">
id | OUTLOOK | temperature | humidity | Temp_Humidity | clouds_airquality | windy | class | observation_weight | feature_vector
----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+----------------
1 | sunny | 85 | 85 | {85,85} | {none,unhealthy} | f | Don't Play | 5 | {85,85}
2 | sunny | 80 | 90 | {80,90} | {none,moderate} | t | Don't Play | 5 | {80,90}
3 | overcast | 83 | 78 | {83,78} | {low,moderate} | f | Play | 1.5 | {83,78}
4 | rain | 70 | 96 | {70,96} | {low,moderate} | f | Play | 1 | {70,96}
5 | rain | 68 | 80 | {68,80} | {medium,good} | f | Play | 1 | {68,80}
6 | rain | 65 | 70 | {65,70} | {low,unhealthy} | t | Don't Play | 1 | {65,70}
7 | overcast | 64 | 65 | {64,65} | {medium,moderate} | t | Play | 1.5 | {64,65}
8 | sunny | 72 | 95 | {72,95} | {high,unhealthy} | f | Don't Play | 5 | {72,95}
9 | sunny | 69 | 70 | {69,70} | {high,good} | f | Play | 5 | {69,70}
10 | rain | 75 | 80 | {75,80} | {medium,good} | f | Play | 1 | {75,80}
11 | sunny | 75 | 70 | {75,70} | {none,good} | t | Play | 5 | {75,70}
12 | overcast | 72 | 90 | {72,90} | {medium,moderate} | t | Play | 1.5 | {72,90}
13 | overcast | 81 | 75 | {81,75} | {medium,moderate} | f | Play | 1.5 | {81,75}
14 | rain | 71 | 80 | {71,80} | {low,unhealthy} | t | Don't Play | 1 | {71,80}
(14 rows)
</pre>
View the summary table:
<pre class="example">
\\x on
SELECT * FROM cols2vec_result_summary;
\\x off
</pre>
<pre class="result">
-[ RECORD 1 ]---------------+-----------------------------
source_table | golf
list_of_features | windy, temperature, humidity
list_of_features_to_exclude | windy
feature_names | {temperature,humidity}
</pre>
This also shows that you can exclude features in 'list_of_features_to_exclude'
that are in the list of 'list_of_features'. This can be useful if the 'list_of_features'
is generated from an expression or subquery.
-# Type casting works as per regular rules of the underlying database.
E.g, combining integer and double precisions columns will create a double precision feature vector.
For Boolean, do an explicit cast to the target type:
<pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'windy::TEXT, class',
NULL,
'id'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre>
<pre class="result">
id | feature_vector
----+----------------------
1 | {false,"Don't Play"}
2 | {true,"Don't Play"}
3 | {false,Play}
4 | {false,Play}
5 | {false,Play}
6 | {true,"Don't Play"}
7 | {true,Play}
8 | {false,"Don't Play"}
9 | {false,Play}
10 | {false,Play}
11 | {true,Play}
12 | {true,Play}
13 | {false,Play}
14 | {true,"Don't Play"}
(14 rows)
</pre>
*/
-------------------------------------------------------------------------
-- cols2vec
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
source_table VARCHAR,
output_table VARCHAR,
list_of_features VARCHAR,
list_of_features_to_exclude VARCHAR,
cols_to_output VARCHAR
) RETURNS void AS $$
PythonFunctionBodyOnly(utilities, transform_vec_cols)
from utilities.control import MinWarning
with MinWarning('warning'):
cols2vec_obj = transform_vec_cols.cols2vec()
return cols2vec_obj.cols2vec(**globals())
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
source_table VARCHAR,
output_table VARCHAR,
list_of_features VARCHAR,
list_of_features_to_exclude VARCHAR
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, $4, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
source_table VARCHAR,
output_table VARCHAR,
list_of_features VARCHAR
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, NULL, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(message TEXT)
RETURNS text AS $$
PythonFunctionBodyOnly(utilities, transform_vec_cols)
from utilities.control import MinWarning
with MinWarning('warning'):
return transform_vec_cols.cols2vec().cols2vec_help_message(schema_madlib, message)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec()
RETURNS text AS $$
SELECT MADLIB_SCHEMA.cols2vec('');
$$ language SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');