blob: 49064d3dc139a2dffb724c9356b18fc9b026ac65 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import plpy
from control import MinWarning
from internal.db_utils import is_col_1d_array
from internal.db_utils import quote_literal
from utilities import _assert
from utilities import add_postfix
from utilities import ANY_ARRAY
from utilities import is_valid_psql_type
from utilities import py_list_to_sql_string
from utilities import split_quoted_delimited_str
from validate_args import is_var_valid
from validate_args import explicit_bool_to_text
from validate_args import get_cols
from validate_args import get_cols_and_types
from validate_args import get_expr_type
from validate_args import input_tbl_valid
from validate_args import output_tbl_valid
from validate_args import table_exists
class vec_cols_helper:
def __init__(self):
self.all_cols = None
def get_cols_as_list(self, cols_to_process, source_table=None, exclude_cols=None):
"""
Get a list of columns based on the value of cols_to_process
Args:
@param cols_to_process: str, Either a * or a comma-separated list of col names
@param source_table: str, optional. Source table name
@param exclude_cols: str, optional. Comma-separated list of the col(s) to exclude
from the source table, only used if cols_to_process is *
Returns:
A list of column names (or an empty list)
"""
# If cols_to_process is empty/None, return empty list
if not cols_to_process:
return []
if cols_to_process.strip() != "*":
# If cols_to_process is a comma separated list of names, return list
# of column names in cols_to_process.
return [col for col in split_quoted_delimited_str(cols_to_process)
if col not in split_quoted_delimited_str(exclude_cols)]
if source_table:
if not self.all_cols:
self.all_cols = get_cols(source_table)
return [col for col in self.all_cols
if col not in split_quoted_delimited_str(exclude_cols)]
return []
class vec2cols:
def __init__(self):
self.get_cols_helper = vec_cols_helper()
self.module_name = self.__class__.__name__
def validate_args(self, source_table, output_table, vector_col, feature_names,
cols_to_output):
"""
Validate args for vec2cols
"""
input_tbl_valid(source_table, self.module_name)
output_tbl_valid(output_table, self.module_name)
is_var_valid(source_table, cols_to_output)
is_var_valid(source_table, vector_col)
_assert(is_valid_psql_type(get_expr_type(vector_col, source_table), ANY_ARRAY),
"{0}: vector_col should refer to an array.".format(self.module_name))
_assert(is_col_1d_array(source_table, vector_col),
"{0}: vector_col must be a 1-dimensional array.".format(self.module_name))
def get_names_for_split_output_cols(self, source_table, vector_col):
"""
Get list of names for the newly-split columns to include in the
output table.
Args:
@param: source_table, str. Source table
@param: vector_col, str. Column name containing the array input
@param: feature_names, list. Python list of the feature names to
use for the split elements in the vector_col array
"""
query = """
SELECT max(array_upper({0}, 1)) AS n_x
FROM {1}
""".format(vector_col, source_table)
max_array_length = plpy.execute(query)[0]["n_x"]
if not max_array_length:
plpy.error('{0}: Column to split ({1}) must not be an empty array'
.format(self.module_name, vector_col))
# Create custom col names for output columns, with prefix "f".
return ["f{0}".format(i+1) for i in range(max_array_length)]
def validate_output_cols(self, features_to_unnest, cols_to_keep):
# If there are more than 1600 columns for the output table, we give a
# warning as it might give an error
MAX_OUTPUT_COLUMN_COUNT = 1600
_assert(len(features_to_unnest)+len(cols_to_keep) < MAX_OUTPUT_COLUMN_COUNT,
"{0}: The output exceeds the max number of columns that " +
"can be created ({1})".format(self.module_name, MAX_OUTPUT_COLUMN_COUNT))
# Check if newly created col names have the same name as existing cols
duplicate_col_names = set(features_to_unnest).intersection(set(cols_to_keep))
_assert(len(duplicate_col_names) == 0,
"{0}: Conflicting column names. Column names in source "
"table cannot be {1}".format(self.module_name,
list(duplicate_col_names)))
def vec2cols(self, schema_madlib, source_table, output_table,
vector_col, feature_names, cols_to_output, **kwargs):
"""
Split up a column of array entries into multiple columns, each column
corresponding to one array position
Args:
@param: schema_madlib, str. The schema with madlib installed
@param: source_table, str. The source table
@param: output_table, str. The output table
@param: vector_col, str. The column with array entries to split up
@param: feature_names, list. Python list of the feature names to use
for the split elements in the vector_col array
@param: cols_to_output, str. Comma-separated list of the columns in
the source_table to include in the output_table
"""
self.validate_args(source_table, output_table, vector_col, feature_names,
cols_to_output)
# Get names of columns to use for the split vector_col
if not feature_names:
features_to_unnest = self.get_names_for_split_output_cols(
source_table, vector_col)
else:
features_to_unnest = feature_names
# Get the columns to keep from the source table
cols_to_keep = self.get_cols_helper.get_cols_as_list(cols_to_output,
source_table)
self.validate_output_cols(features_to_unnest, cols_to_keep)
# Construct the output query and populate the output table with all the
# correct parameters
select_new_cols = (', '.join(["{0}[{1}] AS {2}".format(vector_col, i + 1, f)
for i, f in enumerate(features_to_unnest)]))
cols_from_src_table = ', '.join(cols_to_keep)+', ' if cols_to_keep else ''
query = """
CREATE TABLE {output_table} AS
SELECT {cols_from_src_table} {select_new_cols}
FROM {source_table}
""".format(**locals())
plpy.execute(query)
def vec2cols_help_message(self, schema_madlib, message, **kwargs):
"""
Help message for vec2cols function
"""
summary_string = """
-----------------------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------------------
Functionality: Vector to Columns
The MADlib vec2cols function enables the user to split up a single column into
multiple columns, given that the input column contains array entries. For example,
if the input column contained ARRAY[1, 2, 3] in one of its rows, the output table
will contain 3 different columns, one for each element of the array.
For more details on function usage:
SELECT {schema_madlib}.vec2cols('usage');
For a small example on using the function:
SELECT {schema_madlib}.vec2cols('example');
""".format(schema_madlib=schema_madlib)
usage_string = """
-----------------------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------------------
SELECT {schema_madlib}.vec2cols(
'source_table', -- str, Name of the source table that contains the data
'output_table', -- str, Name of the output view or table
'vector_col', -- str, Name of the array entry column to be split
'feature_names', -- array, Optional parameter to provide a text array of
-- the feature names for the newly split columns (if not
-- provided, default names f0, f1, ... will be used)
'cols_to_output' -- str, Optional parameter to specify any other columns
-- in the source_table to include in the output_table
-- (default none of them, also supports '*' as input)
""".format(schema_madlib=schema_madlib)
example_string = """
-----------------------------------------------------------------------------------
EXAMPLE
-----------------------------------------------------------------------------------
-- Create an input data set:
DROP TABLE IF EXISTS golf CASCADE;
CREATE TABLE golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Temp_Humidity" double precision[],
clouds_airquality text[],
windy boolean,
class text,
observation_weight double precision
);
INSERT INTO golf VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);
-- Call the vec2cols function on the 'clouds_airquality' column, to split it up
DROP TABLE IF EXISTS output_table;
SELECT {schema_madlib}.vec2cols(
'golf', -- source table
'output_table', -- output table
'clouds_airquality', -- column with array entries to split
ARRAY['a', 'b'], -- feature_names array (will use 'a' to name the first new column, and 'b' for the second)
'"OUTLOOK", id' -- columns to keep from source table (as a comma-separated list)
);
SELECT * FROM output_table ORDER BY id;
OUTLOOK | id | a | b
----------+----+--------+-----------
sunny | 1 | none | unhealthy
sunny | 2 | none | moderate
overcast | 3 | low | moderate
rain | 4 | low | moderate
rain | 5 | medium | good
rain | 6 | low | unhealthy
overcast | 7 | medium | moderate
sunny | 8 | high | unhealthy
sunny | 9 | high | good
rain | 10 | medium | good
sunny | 11 | none | good
overcast | 12 | medium | moderate
overcast | 13 | medium | moderate
rain | 14 | low | unhealthy
(14 rows)
""".format(schema_madlib=schema_madlib)
if not message:
return summary_string
elif message.lower() in ('usage', 'help', '?'):
return usage_string
elif message.lower() in ('example', 'examples'):
return example_string
else:
return """
No such option. Use "SELECT {schema_madlib}.vec2cols()" for help.
""".format(schema_madlib=schema_madlib)
class cols2vec:
def __init__(self):
self.get_cols_helper = vec_cols_helper()
self.module_name = self.__class__.__name__
def validate_args(self, source_table, output_table,
list_of_features, list_of_features_to_exclude, cols_to_output):
"""
Function to validate input parameters
"""
input_tbl_valid(source_table, self.module_name)
output_tbl_valid(output_table, self.module_name)
_assert(list_of_features and list_of_features.strip(), "{0}: List of "
"features cannot be empty".format(self.module_name))
if list_of_features.strip() != '*':
is_var_valid(source_table, list_of_features)
if list_of_features_to_exclude:
if list_of_features_to_exclude.strip() == "*":
plpy.error("{0}: Cannot exclude all columns from being "
"features".format(self.module_name))
elif list_of_features.strip() != '*':
plpy.info("{0} NOTICE: will exclude given column(s) even though "
"list of features was not *".format(self.module_name))
is_var_valid(source_table, list_of_features_to_exclude)
is_var_valid(source_table, cols_to_output)
def get_and_validate_feature_types(self, source_table):
"""
This function validates the types of all of the features_to_nest,
disallowing any features of type array. Lets underlying platform
decide whether to allow 2 elements of different types be put into
an array together (ex. integer and text features, if put together,
will yield error by the underlying platform)
"""
all_cols_and_types = get_cols_and_types(source_table)
distinct_types = set([col_type[1] for col_type in all_cols_and_types
if col_type[0] in self.features_to_nest])
_assert(not any(is_valid_psql_type(expr_type, ANY_ARRAY)
for expr_type in distinct_types),
"{0}: Feature columns to nest cannot be of type array"
.format(self.module_name))
if len(distinct_types) != 1 and 'boolean' in distinct_types:
self.features_to_nest = explicit_bool_to_text(source_table, self.features_to_nest, self.schema_madlib)
def cols2vec(self, schema_madlib, source_table, output_table, list_of_features,
list_of_features_to_exclude=None, cols_to_output=None, **kwargs):
"""
Args:
@param schema_madlib: Name of MADlib schema
@param source_table: Name of table containing input data
@param output_table: Name of table to output the results
@param list_of_features: Comma-separated string of column names or
expressions to put into feature array.
Can also be a '*' implying all columns
are to be put into feature array
@param list_of_features_to_exclude: Comma-separated string of column names
to exclude from the feature array
@param cols_to_output: Comma-separated string of column names
from the source table to keep in the output table,
in addition to the feature array
"""
self.schema_madlib = schema_madlib
self.validate_args(source_table, output_table, list_of_features,
list_of_features_to_exclude, cols_to_output)
self.features_to_nest = self.get_cols_helper.get_cols_as_list(list_of_features,
source_table, list_of_features_to_exclude)
self.get_and_validate_feature_types(source_table)
cols_to_keep = ', '.join(self.get_cols_helper.get_cols_as_list(cols_to_output,
source_table))+", " if cols_to_output else ''
feature_list_str = "ARRAY[ {val} ]".format(val=','.join(map(str, self.features_to_nest)))
plpy.execute("""
CREATE TABLE {output_table} AS
SELECT {cols_to_keep}
{feature_list_str} AS feature_vector
FROM {source_table}
""".format(**locals()))
feature_cols = py_list_to_sql_string(
[quote_literal(f) for f in self.features_to_nest], "TEXT", True)
output_table_summary = add_postfix(output_table, "_summary")
# Dollar-quote the text to allow single-quotes without escaping
dq = "$__MADLIB_OUTER__$"
plpy.execute("""
CREATE TABLE {output_table_summary} AS
SELECT
{dq}{source_table}{dq}::TEXT AS source_table,
{dq}{list_of_features}{dq}::TEXT AS list_of_features,
{dq}{list_of_features_to_exclude}{dq}::TEXT AS list_of_features_to_exclude,
{feature_cols} AS feature_names
""".format(**locals()))
def cols2vec_help_message(self, schema_madlib, message, **kwargs):
"""
Help message for cols2vec function
"""
summary_string = """
-----------------------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------------------
Functionality: Columns to Vector
The MADlib cols2vec function, given a table with a number of feature columns, will
create an output table that contains the feature columns in an array. A summary table
will also be created that contains the names of the features combined into array,
so that this process can be reversed using the function vec2cols.
For more details on function usage:
SELECT {schema_madlib}.cols2vec('usage');
For a small example on using the function:
SELECT {schema_madlib}.cols2vec('example');
""".format(schema_madlib=schema_madlib)
usage_string = """
-----------------------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------------------
SELECT {schema_madlib}.cols2vec(
'source_table', -- str, Name of the source table that contains the data
'output_table', -- str, Name of the output view or table
'list_of_features', -- Comma-separated string of column names or
expressions to put into feature array.
Can also be a '*' implying all columns
are to be put into feature array
'list_of_features_to_exclude', -- Comma-separated string of column names
to exclude from the feature array
'cols_to_output', -- Comma-separated string of column names
from the source table to keep in the output table,
in addition to the feature array
""".format(schema_madlib=schema_madlib)
example_string = """
-----------------------------------------------------------------------------------
EXAMPLE
-----------------------------------------------------------------------------------
-- Create an input data set:
DROP TABLE IF EXISTS golf CASCADE;
CREATE TABLE golf (
id integer NOT NULL,
"OUTLOOK" text,
temperature double precision,
humidity double precision,
"Temp_Humidity" double precision[],
clouds_airquality text[],
windy boolean,
class text,
observation_weight double precision
);
INSERT INTO golf VALUES
(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);
-- Call the cols2vec function on the temperature and humidity columns, to nest them.
-- Also include "windy" in features list, but exclude this feature using the exclude field
DROP TABLE IF EXISTS output_table;
SELECT {schema_madlib}.cols2vec(
'golf', -- source table
'output_table', -- output table
'windy, temperature, humidity', -- feature list to nest
'windy', -- features to exclude, from the above feature list
'"OUTLOOK", id' -- columns to keep from source table (as a comma-separated list)
);
SELECT * FROM output_table ORDER BY id;
OUTLOOK | id | feature_vector
----------+----+----------------
sunny | 1 | {{85,85}}
sunny | 2 | {{80,90}}
overcast | 3 | {{83,78}}
rain | 4 | {{70,96}}
rain | 5 | {{68,80}}
rain | 6 | {{65,70}}
overcast | 7 | {{64,65}}
sunny | 8 | {{72,95}}
sunny | 9 | {{69,70}}
rain | 10 | {{75,80}}
sunny | 11 | {{75,70}}
overcast | 12 | {{72,90}}
overcast | 13 | {{81,75}}
rain | 14 | {{71,80}}
(14 rows)
""".format(schema_madlib=schema_madlib)
if not message:
return summary_string
elif message.lower() in ('usage', 'help', '?'):
return usage_string
elif message.lower() in ('example', 'examples'):
return example_string
else:
return """
No such option. Use "SELECT {schema_madlib}.cols2vec()" for help.
""".format(schema_madlib=schema_madlib)