| /* ----------------------------------------------------------------------- */ |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file cols2vec.sql_in |
| * @brief A set of utilities to ease basic table transformations |
| * (such as *aggregating multiple columns in an array) |
| * @date July 2018 |
| * |
| */ |
| /* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_cols2vec |
| |
| @brief Create a new table with all feature columns inserted into a single column as an array |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#cols2vec_syntax">Syntax</a> |
| <li class="level1"><a href="#cols2vec_usage">Usage</a> |
| <li class="level1"><a href="#cols2vec_example">Examples</a> |
| </ul> |
| </div> |
| |
| @about |
| Convert feature columns in a table into an array in a single column. |
| |
| Given a table with a number of feature columns, this function will create an |
| output table that contains the feature columns in an array. A summary |
| table will also be created that contains the names of the features combined into |
| array, so that this process can be reversed using the function vec2cols. |
| |
| @anchor cols2vec_usage |
| @usage |
| |
| <pre class="syntax"> |
| cols2vec( |
| source_table, |
| output_table, |
| list_of_features, |
| list_of_features_to_exclude, |
| cols_to_output |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing the source data.</dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the generated table containing the output.</dd> |
| |
| <dt>list_of_features</dt> |
| <dd>TEXT. |
| Comma-separated string of column names or expressions to put into feature array. |
| Can also be '*' implying all columns are to be put into feature array (except |
| for the ones included in the next argument that lists exclusions). Type casting |
| will be done as per the regular type casting rules of the underlying database. |
| Array columns in the source table are not supported in the 'list_of_features' parameter. Also, |
| all of the features to be included must be of the same type and must not have null |
| values.</dd> |
| |
| <dt>list_of_features_to_exclude (optional)</dt> |
| <dd>TEXT. Default NULL. |
| Comma-separated string of column names to exclude from the feature array. Typically used |
| when 'list_of_features' is set to '*'.</dd> |
| |
| <dt>cols_to_output (optional)</dt> |
| <dd>TEXT. Default NULL. |
| Comma-separated string of column names from the source table to keep in the |
| output table, in addition to the feature array. To keep all columns from the |
| source table, use '*' for this parameter. </dd> |
| </dl> |
| |
| <b>Output table</b> |
| <br> |
| The output table produced by the cols2vec function contains the following columns: |
| <table class="output"> |
| <tr> |
| <th><...></th> |
| <td>Columns from source table, depending on which ones are kept (if any). |
| </td> |
| </tr> |
| <tr> |
| <th>feature_vector</th> |
| <td>Column that contains the feature array. |
| </tr> |
| </table> |
| |
| <b>Output summary table</b> |
| <br> |
| A summary table named <em><output_table>_summary</em> is also created |
| that contains: |
| <table class="output"> |
| <tr> |
| <th>source_table</th> |
| <td>Name of the table containing the source data.</td> |
| </tr> |
| <tr> |
| <th>list_of_features</th> |
| <td>List of features to put in vector.</td> |
| </tr> |
| <tr> |
| <th>list_of_features_to_exclude</th> |
| <td>Features specified by the user to exclude from 'list_of_features'.</td> |
| </tr> |
| <tr> |
| <th>feature_names</th> |
| <td>Names of the features that were nested (converted to a vector) in the output table.</td> |
| </tr> |
| </table> |
| |
| @anchor cols2vec_example |
| @par Examples |
| |
| -# Load sample data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS golf CASCADE; |
| CREATE TABLE golf ( |
| id integer NOT NULL, |
| "OUTLOOK" text, |
| temperature double precision, |
| humidity double precision, |
| "Temp_Humidity" double precision[], |
| clouds_airquality text[], |
| windy boolean, |
| class text, |
| observation_weight double precision |
| ); |
| INSERT INTO golf VALUES |
| (1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), |
| (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), |
| (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), |
| (4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), |
| (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), |
| (6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), |
| (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), |
| (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), |
| (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), |
| (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), |
| (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), |
| (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), |
| (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), |
| (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); |
| </pre> |
| |
| -# Run cols2vec to combine the temperature and humidity columns into a single array feature. |
| <pre class="example"> |
| DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; |
| SELECT madlib.cols2vec( |
| 'golf', |
| 'cols2vec_result', |
| 'temperature, humidity' |
| ); |
| SELECT * FROM cols2vec_result; |
| </pre> |
| <pre class="result"> |
| feature_vector |
| ----------------+ |
| {85,85} |
| {80,90} |
| {83,78} |
| {70,96} |
| {68,80} |
| {65,70} |
| {64,65} |
| {72,95} |
| {69,70} |
| {75,80} |
| {75,70} |
| {72,90} |
| {81,75} |
| {71,80} |
| (14 rows) |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM cols2vec_result_summary; |
| \\x off |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+---------------------------------------------------------------- |
| source_table | golf |
| list_of_features | temperature, humidity |
| list_of_features_to_exclude | None |
| feature_names | {temperature,humidity} |
| </pre> |
| |
| -# Combine the temperature and humidity columns |
| and keep 2 other columns from source_table. |
| <pre class="example"> |
| DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; |
| SELECT madlib.cols2vec( |
| 'golf', |
| 'cols2vec_result', |
| 'temperature, humidity', |
| NULL, |
| 'id, "OUTLOOK"' |
| ); |
| SELECT * FROM cols2vec_result ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | OUTLOOK | feature_vector |
| ----+----------+---------------- |
| 1 | sunny | {85,85} |
| 2 | sunny | {80,90} |
| 3 | overcast | {83,78} |
| 4 | rain | {70,96} |
| 5 | rain | {68,80} |
| 6 | rain | {65,70} |
| 7 | overcast | {64,65} |
| 8 | sunny | {72,95} |
| 9 | sunny | {69,70} |
| 10 | rain | {75,80} |
| 11 | sunny | {75,70} |
| 12 | overcast | {72,90} |
| 13 | overcast | {81,75} |
| 14 | rain | {71,80} |
| (14 rows) |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM cols2vec_result_summary; |
| \\x off |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+---------------------------------------------------------------- |
| source_table | golf |
| list_of_features | temperature, humidity |
| list_of_features_to_exclude | None |
| feature_names | {temperature,humidity} |
| </pre> |
| |
| -# Combine all columns, excluding all columns that are not of type double precision. |
| <pre class="example"> |
| DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; |
| SELECT madlib.cols2vec( |
| 'golf', |
| 'cols2vec_result', |
| '*', |
| '"OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id', |
| 'id, "OUTLOOK"' |
| ); |
| SELECT * FROM cols2vec_result ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | OUTLOOK | feature_vector |
| ----+----------+---------------- |
| 1 | sunny | {85,85,5} |
| 2 | sunny | {80,90,5} |
| 3 | overcast | {83,78,1.5} |
| 4 | rain | {70,96,1} |
| 5 | rain | {68,80,1} |
| 6 | rain | {65,70,1} |
| 7 | overcast | {64,65,1.5} |
| 8 | sunny | {72,95,5} |
| 9 | sunny | {69,70,5} |
| 10 | rain | {75,80,1} |
| 11 | sunny | {75,70,5} |
| 12 | overcast | {72,90,1.5} |
| 13 | overcast | {81,75,1.5} |
| 14 | rain | {71,80,1} |
| (14 rows) |
| </pre> |
| View summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM cols2vec_result_summary; |
| \\x off |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+---------------------------------------------------------------- |
| source_table | golf |
| list_of_features | * |
| list_of_features_to_exclude | "OUTLOOK", "Temp_Humidity", clouds_airquality, windy, class, id |
| feature_names | {temperature,humidity,observation_weight} |
| </pre> |
| |
| -# Combine the temperature and humidity columns, exclude windy, and keep all of the |
| columns from the source table. |
| <pre class="example"> |
| DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; |
| SELECT madlib.cols2vec( |
| 'golf', |
| 'cols2vec_result', |
| 'windy, temperature, humidity', |
| 'windy', |
| '*' |
| ); |
| SELECT * FROM cols2vec_result ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | OUTLOOK | temperature | humidity | Temp_Humidity | clouds_airquality | windy | class | observation_weight | feature_vector |
| ----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+---------------- |
| 1 | sunny | 85 | 85 | {85,85} | {none,unhealthy} | f | Don't Play | 5 | {85,85} |
| 2 | sunny | 80 | 90 | {80,90} | {none,moderate} | t | Don't Play | 5 | {80,90} |
| 3 | overcast | 83 | 78 | {83,78} | {low,moderate} | f | Play | 1.5 | {83,78} |
| 4 | rain | 70 | 96 | {70,96} | {low,moderate} | f | Play | 1 | {70,96} |
| 5 | rain | 68 | 80 | {68,80} | {medium,good} | f | Play | 1 | {68,80} |
| 6 | rain | 65 | 70 | {65,70} | {low,unhealthy} | t | Don't Play | 1 | {65,70} |
| 7 | overcast | 64 | 65 | {64,65} | {medium,moderate} | t | Play | 1.5 | {64,65} |
| 8 | sunny | 72 | 95 | {72,95} | {high,unhealthy} | f | Don't Play | 5 | {72,95} |
| 9 | sunny | 69 | 70 | {69,70} | {high,good} | f | Play | 5 | {69,70} |
| 10 | rain | 75 | 80 | {75,80} | {medium,good} | f | Play | 1 | {75,80} |
| 11 | sunny | 75 | 70 | {75,70} | {none,good} | t | Play | 5 | {75,70} |
| 12 | overcast | 72 | 90 | {72,90} | {medium,moderate} | t | Play | 1.5 | {72,90} |
| 13 | overcast | 81 | 75 | {81,75} | {medium,moderate} | f | Play | 1.5 | {81,75} |
| 14 | rain | 71 | 80 | {71,80} | {low,unhealthy} | t | Don't Play | 1 | {71,80} |
| (14 rows) |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM cols2vec_result_summary; |
| \\x off |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+----------------------------- |
| source_table | golf |
| list_of_features | windy, temperature, humidity |
| list_of_features_to_exclude | windy |
| feature_names | {temperature,humidity} |
| </pre> |
| This also shows that you can exclude features in 'list_of_features_to_exclude' |
| that are in the list of 'list_of_features'. This can be useful if the 'list_of_features' |
| is generated from an expression or subquery. |
| |
| -# Type casting works as per regular rules of the underlying database. |
| E.g, combining integer and double precisions columns will create a double precision feature vector. |
| For Boolean, do an explicit cast to the target type: |
| <pre class="example"> |
| DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; |
| SELECT madlib.cols2vec( |
| 'golf', |
| 'cols2vec_result', |
| 'windy::TEXT, class', |
| NULL, |
| 'id' |
| ); |
| SELECT * FROM cols2vec_result ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | feature_vector |
| ----+---------------------- |
| 1 | {false,"Don't Play"} |
| 2 | {true,"Don't Play"} |
| 3 | {false,Play} |
| 4 | {false,Play} |
| 5 | {false,Play} |
| 6 | {true,"Don't Play"} |
| 7 | {true,Play} |
| 8 | {false,"Don't Play"} |
| 9 | {false,Play} |
| 10 | {false,Play} |
| 11 | {true,Play} |
| 12 | {true,Play} |
| 13 | {false,Play} |
| 14 | {true,"Don't Play"} |
| (14 rows) |
| </pre> |
| |
| */ |
| |
| ------------------------------------------------------------------------- |
| -- cols2vec |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| list_of_features VARCHAR, |
| list_of_features_to_exclude VARCHAR, |
| cols_to_output VARCHAR |
| ) RETURNS void AS $$ |
| PythonFunctionBodyOnly(utilities, transform_vec_cols) |
| from utilities.control import MinWarning |
| with MinWarning('warning'): |
| cols2vec_obj = transform_vec_cols.cols2vec() |
| return cols2vec_obj.cols2vec(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| list_of_features VARCHAR, |
| list_of_features_to_exclude VARCHAR |
| ) RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, $4, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| list_of_features VARCHAR |
| ) RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(message TEXT) |
| RETURNS text AS $$ |
| PythonFunctionBodyOnly(utilities, transform_vec_cols) |
| from utilities.control import MinWarning |
| with MinWarning('warning'): |
| return transform_vec_cols.cols2vec().cols2vec_help_message(schema_madlib, message) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec() |
| RETURNS text AS $$ |
| SELECT MADLIB_SCHEMA.cols2vec(''); |
| $$ language SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |