| /* ----------------------------------------------------------------------- */ |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * @file sessionize.sql_in |
| * |
| * @brief SQL functions for sessionization functions |
| * @date May 2016 |
| * |
| */ |
| /* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_vec2cols |
| |
| @brief Converts a feature array in a single column of an output table into multiple columns. |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#vec2cols_syntax">Syntax</a> |
| <li class="level1"><a href="#vec2cols_usage">Usage</a> |
| <li class="level1"><a href="#vec2cols_example">Examples</a> |
| </ul> |
| </div> |
| |
| @about |
| Converts a feature array in a single column into multiple columns. |
| This process can be used to reverse the function cols2vec. |
| |
| Given a table with a column of type array, this function will create an output |
| table that splits this array into multiple columns, one per array element. |
| It includes the option to name the new feature columns, and to include |
| columns from the original table in the output. |
| |
| @anchor vec2cols_usage |
| @usage |
| |
| <pre class="syntax"> |
| vec2cols( |
| source_table, |
| output_table, |
| vector_col, |
| feature_names, |
| cols_to_output |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing the source data.</tt>. |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the generated table containing the output. If a table with the |
| same name already exists, an error will be returned.</tt> |
| |
| <dt>vector_col</dt> |
| <dd>TEXT. Name of the column containing the feature array. Must be a one-dimensional array.</tt> |
| |
| <dt>feature_names (optional)</dt> |
| <dd>TEXT[]. Array of names associated with the feature array. Note that |
| this array exists in the summary table created by the function 'cols2vec'. If |
| the 'feature_names' array is not specified, |
| column names will be automatically generated of |
| the form 'f1, f2, ...fn'.</tt> |
| @note If you specify the 'feature_names' parameter, you will get exactly that number of |
| feature columns in the 'output_table'. It means feature arrays from the 'vector_col' may be |
| padded or truncated, if a particular feature array size does not match the target |
| number of feature columns. <br><br>If you do not specify the 'feature names' parameter, |
| the number of feature columns generated |
| in the 'output_table' will be the maximum array size from 'vector_col'. |
| Feature arrays that are less than this maximum will be padded. |
| |
| <dt>cols_to_output (optional)</dt> |
| <dd>TEXT, default NULL. Comma-separated string of column names |
| from the source table to keep in the |
| output table, in addition to the feature columns. |
| To keep all columns from the source table, use '*'. |
| The total number of columns in a table cannot exceed the |
| PostgreSQL limits.</tt> |
| </dd> |
| </dl> |
| |
| <b>Output table</b> |
| <br> |
| The output table produced by the vec2cols function contains the following columns: |
| <table class="output"> |
| <tr> |
| <th><...></th> |
| <td>Columns from source table, depending on which ones are kept (if any). |
| </td> |
| </tr> |
| <tr> |
| <th>feature columns</th> |
| <td>Columns for each of the features in 'vector_col'. Column type |
| will depend on the feature array type in the source table. Column |
| naming will depend on whether the parameter 'feature_names' is used. |
| </tr> |
| </table> |
| |
| |
| @anchor vec2cols_example |
| @par Examples |
| -# Load sample data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS golf CASCADE; |
| CREATE TABLE golf ( |
| id integer NOT NULL, |
| "OUTLOOK" text, |
| temperature double precision, |
| humidity double precision, |
| "Temp_Humidity" double precision[], |
| clouds_airquality text[], |
| windy boolean, |
| class text, |
| observation_weight double precision |
| ); |
| INSERT INTO golf VALUES |
| (1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0), |
| (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0), |
| (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5), |
| (4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0), |
| (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), |
| (6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0), |
| (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), |
| (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0), |
| (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0), |
| (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0), |
| (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0), |
| (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5), |
| (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5), |
| (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0); |
| </pre> |
| |
| -# Split the column "clouds_airquality" into new columns |
| called "clouds" and "air_quality". Also keep columns id |
| and "OUTLOOK" from the source table |
| <pre class="example"> |
| DROP TABLE IF EXISTS vec2cols_result; |
| SELECT madlib.vec2cols( |
| 'golf', -- source table |
| 'vec2cols_result', -- output table |
| 'clouds_airquality', -- column with array entries to split |
| ARRAY['clouds', 'air_quality'], -- feature names |
| 'id, "OUTLOOK"' -- columns to keep from source table |
| ); |
| SELECT * FROM vec2cols_result ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | OUTLOOK | clouds | air_quality |
| ----+----------+--------+------------- |
| 1 | sunny | none | unhealthy |
| 2 | sunny | none | moderate |
| 3 | overcast | low | moderate |
| 4 | rain | low | moderate |
| 5 | rain | medium | good |
| 6 | rain | low | unhealthy |
| 7 | overcast | medium | moderate |
| 8 | sunny | high | unhealthy |
| 9 | sunny | high | good |
| 10 | rain | medium | good |
| 11 | sunny | none | good |
| 12 | overcast | medium | moderate |
| 13 | overcast | medium | moderate |
| 14 | rain | low | unhealthy |
| (14 rows) |
| </pre> |
| |
| -# Similar to the previous example, except now |
| we keep all columns from source table and do not |
| specify the feature names, so that default names |
| are created. |
| <pre class="example"> |
| DROP TABLE IF EXISTS vec2cols_result; |
| SELECT madlib.vec2cols( |
| 'golf', -- source table |
| 'vec2cols_result', -- output table |
| 'clouds_airquality', -- column with array entries to split |
| NULL, -- feature names |
| '*' -- columns to keep from source table |
| ); |
| SELECT * FROM vec2cols_result ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | OUTLOOK | temperature | humidity | Temp_Humidity | clouds_airquality | windy | class | observation_weight | f1 | f2 |
| ----+----------+-------------+----------+---------------+-------------------+-------+------------+--------------------+--------+----------- |
| 1 | sunny | 85 | 85 | {85,85} | {none,unhealthy} | f | Don't Play | 5 | none | unhealthy |
| 2 | sunny | 80 | 90 | {80,90} | {none,moderate} | t | Don't Play | 5 | none | moderate |
| 3 | overcast | 83 | 78 | {83,78} | {low,moderate} | f | Play | 1.5 | low | moderate |
| 4 | rain | 70 | 96 | {70,96} | {low,moderate} | f | Play | 1 | low | moderate |
| 5 | rain | 68 | 80 | {68,80} | {medium,good} | f | Play | 1 | medium | good |
| 6 | rain | 65 | 70 | {65,70} | {low,unhealthy} | t | Don't Play | 1 | low | unhealthy |
| 7 | overcast | 64 | 65 | {64,65} | {medium,moderate} | t | Play | 1.5 | medium | moderate |
| 8 | sunny | 72 | 95 | {72,95} | {high,unhealthy} | f | Don't Play | 5 | high | unhealthy |
| 9 | sunny | 69 | 70 | {69,70} | {high,good} | f | Play | 5 | high | good |
| 10 | rain | 75 | 80 | {75,80} | {medium,good} | f | Play | 1 | medium | good |
| 11 | sunny | 75 | 70 | {75,70} | {none,good} | t | Play | 5 | none | good |
| 12 | overcast | 72 | 90 | {72,90} | {medium,moderate} | t | Play | 1.5 | medium | moderate |
| 13 | overcast | 81 | 75 | {81,75} | {medium,moderate} | f | Play | 1.5 | medium | moderate |
| 14 | rain | 71 | 80 | {71,80} | {low,unhealthy} | t | Don't Play | 1 | low | unhealthy |
| (14 rows) |
| </pre> |
| |
| -# Now let's run cols2vec then reverse it using vec2cols. |
| In this case we will get feature names from the cols2vec summary table. |
| First run cols2vec: |
| <pre class="example"> |
| DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary; |
| SELECT madlib.cols2vec( |
| 'golf', |
| 'cols2vec_result', |
| 'temperature, humidity', |
| NULL, |
| 'id, temperature, humidity' |
| ); |
| SELECT * FROM cols2vec_result ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | temperature | humidity | feature_vector |
| ----+-------------+----------+---------------- |
| 1 | 85 | 85 | {85,85} |
| 2 | 80 | 90 | {80,90} |
| 3 | 83 | 78 | {83,78} |
| 4 | 70 | 96 | {70,96} |
| 5 | 68 | 80 | {68,80} |
| 6 | 65 | 70 | {65,70} |
| 7 | 64 | 65 | {64,65} |
| 8 | 72 | 95 | {72,95} |
| 9 | 69 | 70 | {69,70} |
| 10 | 75 | 80 | {75,80} |
| 11 | 75 | 70 | {75,70} |
| 12 | 72 | 90 | {72,90} |
| 13 | 81 | 75 | {81,75} |
| 14 | 71 | 80 | {71,80} |
| (14 rows) |
| </pre> |
| View the summary table with the feature_names dictionary: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM cols2vec_result_summary; |
| \\x off |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---------------+----------------------- |
| source_table | golf |
| list_of_features | temperature, humidity |
| list_of_features_to_exclude | None |
| feature_names | {temperature,humidity} |
| </pre> |
| Now use feature_names from the summary table above |
| to name the columns of the split array: |
| <pre class="example"> |
| DROP TABLE IF EXISTS vec2cols_result; |
| SELECT madlib.vec2cols( |
| 'cols2vec_result', -- source table containing the feature vector |
| 'vec2cols_result', -- output table |
| 'feature_vector', -- column with array entries to split |
| (SELECT feature_names from cols2vec_result_summary), -- feature_names from summary table of cols2vec |
| 'id' -- columns to keep from source table |
| ); |
| SELECT * FROM vec2cols_result ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | temperature | humidity |
| ----+-------------+---------- |
| 1 | 85 | 85 |
| 2 | 80 | 90 |
| 3 | 83 | 78 |
| 4 | 70 | 96 |
| 5 | 68 | 80 |
| 6 | 65 | 70 |
| 7 | 64 | 65 |
| 8 | 72 | 95 |
| 9 | 69 | 70 |
| 10 | 75 | 80 |
| 11 | 75 | 70 |
| 12 | 72 | 90 |
| 13 | 81 | 75 |
| 14 | 71 | 80 |
| (14 rows) |
| </pre> |
| This is the same as the format of the original 'golf' dataset |
| that we started with. |
| |
| */ |
| |
| ------------------------------------------------------------------------- |
| -- vec2cols |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| vector_col VARCHAR, |
| feature_names TEXT[], |
| cols_to_output VARCHAR |
| ) RETURNS void AS $$ |
| PythonFunctionBodyOnly(utilities, transform_vec_cols) |
| from utilities.control import MinWarning |
| with MinWarning('warning'): |
| vec2cols_obj = transform_vec_cols.vec2cols() |
| return vec2cols_obj.vec2cols(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| vector_col VARCHAR, |
| feature_names TEXT[] |
| ) RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.vec2cols($1,$2,$3,$4,NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| vector_col VARCHAR |
| ) RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.vec2cols($1,$2,$3,NULL,NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols(message TEXT) |
| RETURNS text AS $$ |
| PythonFunctionBodyOnly(utilities, transform_vec_cols) |
| return transform_vec_cols.vec2cols().vec2cols_help_message(schema_madlib, message) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.vec2cols() |
| RETURNS text AS $$ |
| SELECT MADLIB_SCHEMA.vec2cols(''); |
| $$ language SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |