blob: a466d2463eafe7cf9e0b9aff547b9668076508a0 [file] [log] [blame]
/* ----------------------------------------------------------------------- */
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @file sessionize.sql_in
*
* @brief SQL functions for sessionization functions
* @date May 2016
*
*/
/* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_vec2cols
@brief Converts a feature array in a single column of an output table into multiple columns.
<div class="toc"><b>Contents</b>
<ul>
<li class="level1"><a href="#vec2cols_syntax">Syntax</a>
<li class="level1"><a href="#vec2cols_usage">Usage</a>
<li class="level1"><a href="#vec2cols_example">Examples</a>
</ul>
</div>
@about
Converts a feature array in a single column into multiple columns.
This process can be used to reverse the function cols2vec.
Given a table with a column of type array, this function will create an output
table that splits this array into multiple columns, one per array element.
It includes the option to name the new feature columns, and to include
columns from the original table in the output.
@anchor vec2cols_usage
@usage
<pre class="syntax">
vec2cols(
source_table,
output_table,
vector_col,
feature_names,
cols_to_output
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>TEXT. Name of the table containing the source data.</tt>.
<dt>output_table</dt>
<dd>TEXT. Name of the generated table containing the output. If a table with the
same name already exists, an error will be returned.</tt>
<dt>vector_col</dt>
<dd>TEXT. Name of the column containing the feature array. Must be a one-dimensional array.</tt>
<dt>feature_names (optional)</dt>
<dd>TEXT[]. Array of names associated with the feature array. Note that
this array exists in the summary table created by the function 'cols2vec'. If
the 'feature_names' array is not specified,
column names will be automatically generated of
the form 'f1, f2, ...fn'.</tt>
@note If you specify the 'feature_names' parameter, you will get exactly that number of
feature columns in the 'output_table'. It means feature arrays from the 'vector_col' may be
padded or truncated, if a particular feature array size does not match the target
number of feature columns. <br><br>If you do not specify the 'feature names' parameter,
the number of feature columns generated
in the 'output_table' will be the maximum array size from 'vector_col'.
Feature arrays that are less than this maximum will be padded.
<dt>cols_to_output (optional)</dt>
<dd>TEXT, default NULL. Comma-separated string of column names
from the source table to keep in the
output table, in addition to the feature columns.
To keep all columns from the source table, use '*'.
The total number of columns in a table cannot exceed the
PostgreSQL limits.</tt>
</dd>
</dl>
<b>Output table</b>
<br>
The output table produced by the vec2cols function contains the following columns:
<table class="output">
<tr>
<th><...></th>
<td>Columns from source table, depending on which ones are kept (if any).
</td>
</tr>
<tr>
<th>feature columns</th>
<td>Columns for each of the features in 'vector_col'. Column type
will depend on the feature array type in the source table. Column
naming will depend on whether the parameter 'feature_names' is used.
</tr>
</table>
@anchor vec2cols_example
@par Examples
-# Load sample data:
<pre class="example">
DROP TABLE IF EXISTS golf CASCADE;
CREATE TABLE golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Temp_Humidity" double precision[],
clouds_airquality text[],
windy boolean,
class text,
observation_weight double precision
);
INSERT INTO golf VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);
</pre>
-# Split the column "clouds_airquality" into new columns
called "clouds" and "air_quality". Also keep columns id
and "OUTLOOK" from the source table
<pre class="example">
DROP TABLE IF EXISTS vec2cols_result;
SELECT madlib.vec2cols(
'golf', -- source table
'vec2cols_result', -- output table
'clouds_airquality', -- column with array entries to split
ARRAY['clouds', 'air_quality'], -- feature names
'id, "OUTLOOK"' -- columns to keep from source table
);
SELECT * FROM vec2cols_result ORDER BY id;
</pre>
<pre class="result">
id | OUTLOOK | clouds | air_quality
----+----------+--------+-------------
1 | sunny | none | unhealthy
2 | sunny | none | moderate
3 | overcast | low | moderate
4 | rain | low | moderate
5 | rain | medium | good
6 | rain | low | unhealthy
7 | overcast | medium | moderate
8 | sunny | high | unhealthy
9 | sunny | high | good
10 | rain | medium | good
11 | sunny | none | good
12 | overcast | medium | moderate
13 | overcast | medium | moderate
14 | rain | low | unhealthy
(14 rows)
</pre>
-# Similar to the previous example, except now
we keep all columns from source table and do not
specify the feature names, so that default names
are created.
<pre class="example">
DROP TABLE IF EXISTS vec2cols_result;
SELECT madlib.vec2cols(
'golf', -- source table
'vec2cols_result', -- output table
'clouds_airquality', -- column with array entries to split
NULL, -- feature names
'*' -- columns to keep from source table
);
SELECT * FROM vec2cols_result ORDER BY id;
</pre>
<pre class="result">
id | OUTLOOK | temperature | humidity | Temp_Humidity | clouds_airquality | windy | class | observation_weight | f1 | f2
----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+--------+-----------
1 | sunny | 85 | 85 | {85,85} | {none,unhealthy} | f | Don't Play | 5 | none | unhealthy
2 | sunny | 80 | 90 | {80,90} | {none,moderate} | t | Don't Play | 5 | none | moderate
3 | overcast | 83 | 78 | {83,78} | {low,moderate} | f | Play | 1.5 | low | moderate
4 | rain | 70 | 96 | {70,96} | {low,moderate} | f | Play | 1 | low | moderate
5 | rain | 68 | 80 | {68,80} | {medium,good} | f | Play | 1 | medium | good
6 | rain | 65 | 70 | {65,70} | {low,unhealthy} | t | Don't Play | 1 | low | unhealthy
7 | overcast | 64 | 65 | {64,65} | {medium,moderate} | t | Play | 1.5 | medium | moderate
8 | sunny | 72 | 95 | {72,95} | {high,unhealthy} | f | Don't Play | 5 | high | unhealthy
9 | sunny | 69 | 70 | {69,70} | {high,good} | f | Play | 5 | high | good
10 | rain | 75 | 80 | {75,80} | {medium,good} | f | Play | 1 | medium | good
11 | sunny | 75 | 70 | {75,70} | {none,good} | t | Play | 5 | none | good
12 | overcast | 72 | 90 | {72,90} | {medium,moderate} | t | Play | 1.5 | medium | moderate
13 | overcast | 81 | 75 | {81,75} | {medium,moderate} | f | Play | 1.5 | medium | moderate
14 | rain | 71 | 80 | {71,80} | {low,unhealthy} | t | Don't Play | 1 | low | unhealthy
(14 rows)
</pre>
-# Now let's run cols2vec then reverse it using vec2cols.
In this case we will get feature names from the cols2vec summary table.
First run cols2vec:
<pre class="example">
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT madlib.cols2vec(
'golf',
'cols2vec_result',
'temperature, humidity',
NULL,
'id, temperature, humidity'
);
SELECT * FROM cols2vec_result ORDER BY id;
</pre>
<pre class="result">
id | temperature | humidity | feature_vector
----+-------------+----------+----------------
1 | 85 | 85 | {85,85}
2 | 80 | 90 | {80,90}
3 | 83 | 78 | {83,78}
4 | 70 | 96 | {70,96}
5 | 68 | 80 | {68,80}
6 | 65 | 70 | {65,70}
7 | 64 | 65 | {64,65}
8 | 72 | 95 | {72,95}
9 | 69 | 70 | {69,70}
10 | 75 | 80 | {75,80}
11 | 75 | 70 | {75,70}
12 | 72 | 90 | {72,90}
13 | 81 | 75 | {81,75}
14 | 71 | 80 | {71,80}
(14 rows)
</pre>
View the summary table with the feature_names dictionary:
<pre class="example">
\\x on
SELECT * FROM cols2vec_result_summary;
\\x off
</pre>
<pre class="result">
-[ RECORD 1 ]---------------+-----------------------
source_table | golf
list_of_features | temperature, humidity
list_of_features_to_exclude | None
feature_names | {temperature,humidity}
</pre>
Now use feature_names from the summary table above
to name the columns of the split array:
<pre class="example">
DROP TABLE IF EXISTS vec2cols_result;
SELECT madlib.vec2cols(
'cols2vec_result', -- source table containing the feature vector
'vec2cols_result', -- output table
'feature_vector', -- column with array entries to split
(SELECT feature_names from cols2vec_result_summary), -- feature_names from summary table of cols2vec
'id' -- columns to keep from source table
);
SELECT * FROM vec2cols_result ORDER BY id;
</pre>
<pre class="result">
id | temperature | humidity
----+-------------+----------
1 | 85 | 85
2 | 80 | 90
3 | 83 | 78
4 | 70 | 96
5 | 68 | 80
6 | 65 | 70
7 | 64 | 65
8 | 72 | 95
9 | 69 | 70
10 | 75 | 80
11 | 75 | 70
12 | 72 | 90
13 | 81 | 75
14 | 71 | 80
(14 rows)
</pre>
This is the same as the format of the original 'golf' dataset
that we started with.
*/
-------------------------------------------------------------------------
-- vec2cols
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols(
source_table VARCHAR,
output_table VARCHAR,
vector_col VARCHAR,
feature_names TEXT[],
cols_to_output VARCHAR
) RETURNS void AS $$
PythonFunctionBodyOnly(utilities, transform_vec_cols)
from utilities.control import MinWarning
with MinWarning('warning'):
vec2cols_obj = transform_vec_cols.vec2cols()
return vec2cols_obj.vec2cols(**globals())
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols(
source_table VARCHAR,
output_table VARCHAR,
vector_col VARCHAR,
feature_names TEXT[]
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.vec2cols($1,$2,$3,$4,NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols(
source_table VARCHAR,
output_table VARCHAR,
vector_col VARCHAR
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.vec2cols($1,$2,$3,NULL,NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols(message TEXT)
RETURNS text AS $$
PythonFunctionBodyOnly(utilities, transform_vec_cols)
return transform_vec_cols.vec2cols().vec2cols_help_message(schema_madlib, message)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols()
RETURNS text AS $$
SELECT MADLIB_SCHEMA.vec2cols('');
$$ language SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');