src/ports/postgres/modules/utilities/transform_vec_cols.py_in - madlib - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import plpy
 from control import MinWarning
 from internal.db_utils import is_col_1d_array
 from internal.db_utils import quote_literal
 from utilities import _assert
 from utilities import add_postfix
 from utilities import ANY_ARRAY
 from utilities import is_valid_psql_type
 from utilities import py_list_to_sql_string
 from utilities import split_quoted_delimited_str
 from validate_args import is_var_valid
 from validate_args import explicit_bool_to_text
 from validate_args import get_cols
 from validate_args import get_cols_and_types
 from validate_args import get_expr_type
 from validate_args import input_tbl_valid
 from validate_args import output_tbl_valid
 from validate_args import table_exists

 class vec_cols_helper:
     def __init__(self):
         self.all_cols = None

     def get_cols_as_list(self, cols_to_process, source_table=None, exclude_cols=None):
         """
             Get a list of columns based on the value of cols_to_process
             Args:
             @param cols_to_process: str, Either a * or a comma-separated list of col names
             @param source_table: str, optional. Source table name
             @param exclude_cols: str, optional. Comma-separated list of the col(s) to exclude
                                  from the source table, only used if cols_to_process is *
             Returns:
             A list of column names (or an empty list)
         """
         # If cols_to_process is empty/None, return empty list
         if not cols_to_process:
             return []
         if cols_to_process.strip() != "*":
             # If cols_to_process is a comma separated list of names, return list
             # of column names in cols_to_process.
             return [col for col in split_quoted_delimited_str(cols_to_process)
                     if col not in split_quoted_delimited_str(exclude_cols)]
         if source_table:
             if not self.all_cols:
                 self.all_cols = get_cols(source_table)
             return [col for col in self.all_cols
                     if col not in split_quoted_delimited_str(exclude_cols)]
         return []

 class vec2cols:
     def __init__(self):
         self.get_cols_helper = vec_cols_helper()
         self.module_name = self.__class__.__name__

     def validate_args(self, source_table, output_table, vector_col, feature_names,
                       cols_to_output):
         """
             Validate args for vec2cols
         """
         input_tbl_valid(source_table, self.module_name)
         output_tbl_valid(output_table, self.module_name)
         is_var_valid(source_table, cols_to_output)
         is_var_valid(source_table, vector_col)
         _assert(is_valid_psql_type(get_expr_type(vector_col, source_table), ANY_ARRAY),
             "{0}: vector_col should refer to an array.".format(self.module_name))
         _assert(is_col_1d_array(source_table, vector_col),
             "{0}: vector_col must be a 1-dimensional array.".format(self.module_name))

     def get_names_for_split_output_cols(self, source_table, vector_col):
         """
             Get list of names for the newly-split columns to include in the
             output table.
             Args:
             @param: source_table, str. Source table
             @param: vector_col, str. Column name containing the array input
             @param: feature_names, list. Python list of the feature names to
                     use for the split elements in the vector_col array
         """
         query = """
             SELECT max(array_upper({0}, 1)) AS n_x
             FROM {1}
         """.format(vector_col, source_table)
         max_array_length = plpy.execute(query)[0]["n_x"]
         if not max_array_length:
             plpy.error('{0}: Column to split ({1}) must not be an empty array'
                 .format(self.module_name, vector_col))
         # Create custom col names for output columns, with prefix "f".
         return ["f{0}".format(i+1) for i in range(max_array_length)]

     def validate_output_cols(self, features_to_unnest, cols_to_keep):
         # If there are more than 1600 columns for the output table, we give a
         # warning as it might give an error
         MAX_OUTPUT_COLUMN_COUNT = 1600
         _assert(len(features_to_unnest)+len(cols_to_keep) < MAX_OUTPUT_COLUMN_COUNT,
                 "{0}: The output exceeds the max number of columns that " +
                 "can be created ({1})".format(self.module_name, MAX_OUTPUT_COLUMN_COUNT))
         # Check if newly created col names have the same name as existing cols
         duplicate_col_names = set(features_to_unnest).intersection(set(cols_to_keep))
         _assert(len(duplicate_col_names) == 0,
                 "{0}: Conflicting column names. Column names in source "
                 "table cannot be {1}".format(self.module_name,
                                             list(duplicate_col_names)))

     def vec2cols(self, schema_madlib, source_table, output_table,
                  vector_col, feature_names, cols_to_output, **kwargs):
         """
             Split up a column of array entries into multiple columns, each column
             corresponding to one array position
             Args:
             @param: schema_madlib, str. The schema with madlib installed
             @param: source_table, str. The source table
             @param: output_table, str. The output table
             @param: vector_col, str. The column with array entries to split up
             @param: feature_names, list. Python list of the feature names to use
                     for the split elements in the vector_col array
             @param: cols_to_output, str. Comma-separated list of the columns in
                     the source_table to include in the output_table
         """
         self.validate_args(source_table, output_table, vector_col, feature_names,
                            cols_to_output)

         # Get names of columns to use for the split vector_col
         if not feature_names:
             features_to_unnest = self.get_names_for_split_output_cols(
                 source_table, vector_col)
         else:
             features_to_unnest = feature_names

         # Get the columns to keep from the source table
         cols_to_keep = self.get_cols_helper.get_cols_as_list(cols_to_output,
                        source_table)

         self.validate_output_cols(features_to_unnest, cols_to_keep)

         # Construct the output query and populate the output table with all the
         # correct parameters
         select_new_cols = (', '.join(["{0}[{1}] AS {2}".format(vector_col, i + 1, f)
                                      for i, f in enumerate(features_to_unnest)]))
         cols_from_src_table = ', '.join(cols_to_keep)+', ' if cols_to_keep else ''
         query = """
         CREATE TABLE {output_table} AS
         SELECT {cols_from_src_table} {select_new_cols}
         FROM {source_table}
         """.format(**locals())
         plpy.execute(query)

     def vec2cols_help_message(self, schema_madlib, message, **kwargs):
         """
             Help message for vec2cols function
         """
         summary_string = """
 -----------------------------------------------------------------------------------
                                     SUMMARY
 -----------------------------------------------------------------------------------
 Functionality: Vector to Columns

 The MADlib vec2cols function enables the user to split up a single column into
 multiple columns, given that the input column contains array entries. For example,
 if the input column contained ARRAY[1, 2, 3] in one of its rows, the output table
 will contain 3 different columns, one for each element of the array.

 For more details on function usage:
     SELECT {schema_madlib}.vec2cols('usage');

 For a small example on using the function:
     SELECT {schema_madlib}.vec2cols('example');
     """.format(schema_madlib=schema_madlib)

         usage_string = """
 -----------------------------------------------------------------------------------
                                     USAGE
 -----------------------------------------------------------------------------------
 SELECT {schema_madlib}.vec2cols(
     'source_table',     -- str, Name of the source table that contains the data
     'output_table',     -- str, Name of the output view or table
     'vector_col',       -- str, Name of the array entry column to be split
     'feature_names',    -- array, Optional parameter to provide a text array of
                         -- the feature names for the newly split columns (if not
                         -- provided, default names f0, f1, ... will be used)
     'cols_to_output'    -- str, Optional parameter to specify any other columns
                         -- in the source_table to include in the output_table
                         -- (default none of them, also supports '*' as input)
     """.format(schema_madlib=schema_madlib)

         example_string = """
 -----------------------------------------------------------------------------------
                                     EXAMPLE
 -----------------------------------------------------------------------------------
 -- Create an input data set:

 DROP TABLE IF EXISTS golf CASCADE;
 CREATE TABLE golf (
     id integer NOT NULL,
     "OUTLOOK" text,
     temperature double precision,
     humidity double precision,
     "Temp_Humidity" double precision[],
     clouds_airquality text[],
     windy boolean,
     class text,
     observation_weight double precision
 );
 INSERT INTO golf VALUES
 (1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
 (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
 (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
 (4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
 (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
 (6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
 (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
 (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
 (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
 (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
 (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
 (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
 (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
 (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);

 -- Call the vec2cols function on the 'clouds_airquality' column, to split it up

 DROP TABLE IF EXISTS output_table;
 SELECT {schema_madlib}.vec2cols(
     'golf',               -- source table
     'output_table',       -- output table
     'clouds_airquality',  -- column with array entries to split
     ARRAY['a', 'b'],      -- feature_names array (will use 'a' to name the first new column, and 'b' for the second)
     '"OUTLOOK", id'       -- columns to keep from source table (as a comma-separated list)
 );

 SELECT * FROM output_table ORDER BY id;
  OUTLOOK  | id |   a    |     b
 ----------+----+--------+-----------
  sunny    |  1 | none   | unhealthy
  sunny    |  2 | none   | moderate
  overcast |  3 | low    | moderate
  rain     |  4 | low    | moderate
  rain     |  5 | medium | good
  rain     |  6 | low    | unhealthy
  overcast |  7 | medium | moderate
  sunny    |  8 | high   | unhealthy
  sunny    |  9 | high   | good
  rain     | 10 | medium | good
  sunny    | 11 | none   | good
  overcast | 12 | medium | moderate
  overcast | 13 | medium | moderate
  rain     | 14 | low    | unhealthy
 (14 rows)
 """.format(schema_madlib=schema_madlib)

         if not message:
             return summary_string
         elif message.lower() in ('usage', 'help', '?'):
             return usage_string
         elif message.lower() in ('example', 'examples'):
             return example_string
         else:
             return """
 No such option. Use "SELECT {schema_madlib}.vec2cols()" for help.
         """.format(schema_madlib=schema_madlib)

 class cols2vec:
     def __init__(self):
         self.get_cols_helper = vec_cols_helper()
         self.module_name = self.__class__.__name__

     def validate_args(self, source_table, output_table,
                       list_of_features, list_of_features_to_exclude, cols_to_output):
         """
             Function to validate input parameters
         """
         input_tbl_valid(source_table, self.module_name)
         output_tbl_valid(output_table, self.module_name)

         _assert(list_of_features and list_of_features.strip(), "{0}: List of "
                     "features cannot be empty".format(self.module_name))
         if list_of_features.strip() != '*':
             is_var_valid(source_table, list_of_features)

         if list_of_features_to_exclude:
             if list_of_features_to_exclude.strip() == "*":
                 plpy.error("{0}: Cannot exclude all columns from being "
                     "features".format(self.module_name))
             elif list_of_features.strip() != '*':
                 plpy.info("{0} NOTICE: will exclude given column(s) even though "
                     "list of features was not *".format(self.module_name))

         is_var_valid(source_table, list_of_features_to_exclude)
         is_var_valid(source_table, cols_to_output)

     def get_and_validate_feature_types(self, source_table):
         """
             This function validates the types of all of the features_to_nest,
             disallowing any features of type array. Lets underlying platform
             decide whether to allow 2 elements of different types be put into
             an array together (ex. integer and text features, if put together,
             will yield error by the underlying platform)
         """
         all_cols_and_types = get_cols_and_types(source_table)
         distinct_types = set([col_type[1] for col_type in all_cols_and_types
             if col_type[0] in self.features_to_nest])
         _assert(not any(is_valid_psql_type(expr_type, ANY_ARRAY)
                     for expr_type in distinct_types),
                 "{0}: Feature columns to nest cannot be of type array"
                 .format(self.module_name))

         if len(distinct_types) != 1 and 'boolean' in distinct_types:
             self.features_to_nest = explicit_bool_to_text(source_table, self.features_to_nest, self.schema_madlib)

     def cols2vec(self, schema_madlib, source_table, output_table, list_of_features,
                  list_of_features_to_exclude=None, cols_to_output=None, **kwargs):
         """
         Args:
             @param schema_madlib: Name of MADlib schema
             @param source_table: Name of table containing input data
             @param output_table: Name of table to output the results
             @param list_of_features: Comma-separated string of column names or
                                      expressions to put into feature array.
                                      Can also be a '*' implying all columns
                                      are to be put into feature array
             @param list_of_features_to_exclude: Comma-separated string of column names
                                                 to exclude from the feature array
             @param cols_to_output: Comma-separated string of column names
                                    from the source table to keep in the output table,
                                    in addition to the feature array
         """
         self.schema_madlib = schema_madlib
         self.validate_args(source_table, output_table, list_of_features,
                            list_of_features_to_exclude, cols_to_output)

         self.features_to_nest = self.get_cols_helper.get_cols_as_list(list_of_features,
                                 source_table, list_of_features_to_exclude)
         self.get_and_validate_feature_types(source_table)

         cols_to_keep = ', '.join(self.get_cols_helper.get_cols_as_list(cols_to_output,
                        source_table))+", " if cols_to_output else ''

         feature_list_str = "ARRAY[ {val} ]".format(val=','.join(map(str, self.features_to_nest)))
         plpy.execute("""
                 CREATE TABLE {output_table} AS
                     SELECT {cols_to_keep}
                            {feature_list_str} AS feature_vector
                     FROM {source_table}
                 """.format(**locals()))

         feature_cols = py_list_to_sql_string(
                 [quote_literal(f) for f in self.features_to_nest], "TEXT", True)

         output_table_summary = add_postfix(output_table, "_summary")
         # Dollar-quote the text to allow single-quotes without escaping
         dq = "$__MADLIB_OUTER__$"
         plpy.execute("""
             CREATE TABLE {output_table_summary} AS
             SELECT
                 {dq}{source_table}{dq}::TEXT AS source_table,
                 {dq}{list_of_features}{dq}::TEXT AS list_of_features,
                 {dq}{list_of_features_to_exclude}{dq}::TEXT AS list_of_features_to_exclude,
                 {feature_cols} AS feature_names
             """.format(**locals()))

     def cols2vec_help_message(self, schema_madlib, message, **kwargs):
         """
             Help message for cols2vec function
         """
         summary_string = """
 -----------------------------------------------------------------------------------
                                     SUMMARY
 -----------------------------------------------------------------------------------
 Functionality: Columns to Vector

 The MADlib cols2vec function, given a table with a number of feature columns, will
 create an output table that contains the feature columns in an array. A summary table
 will also be created that contains the names of the features combined into array,
 so that this process can be reversed using the function vec2cols.

 For more details on function usage:
     SELECT {schema_madlib}.cols2vec('usage');

 For a small example on using the function:
     SELECT {schema_madlib}.cols2vec('example');
     """.format(schema_madlib=schema_madlib)

         usage_string = """
 -----------------------------------------------------------------------------------
                                     USAGE
 -----------------------------------------------------------------------------------
 SELECT {schema_madlib}.cols2vec(
     'source_table',     -- str, Name of the source table that contains the data
     'output_table',     -- str, Name of the output view or table
     'list_of_features', -- Comma-separated string of column names or
                            expressions to put into feature array.
                            Can also be a '*' implying all columns
                            are to be put into feature array
     'list_of_features_to_exclude', -- Comma-separated string of column names
                                       to exclude from the feature array
     'cols_to_output',   -- Comma-separated string of column names
                            from the source table to keep in the output table,
                            in addition to the feature array
     """.format(schema_madlib=schema_madlib)

         example_string = """
 -----------------------------------------------------------------------------------
                                     EXAMPLE
 -----------------------------------------------------------------------------------
 -- Create an input data set:

 DROP TABLE IF EXISTS golf CASCADE;
 CREATE TABLE golf (
     id integer NOT NULL,
     "OUTLOOK" text,
     temperature double precision,
     humidity double precision,
     "Temp_Humidity" double precision[],
     clouds_airquality text[],
     windy boolean,
     class text,
     observation_weight double precision
 );
 INSERT INTO golf VALUES
 (1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
 (2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
 (3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
 (4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
 (5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
 (6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
 (7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
 (8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
 (9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
 (10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
 (11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
 (12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
 (13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
 (14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);

 -- Call the cols2vec function on the temperature and humidity columns, to nest them.
 -- Also include "windy" in features list, but exclude this feature using the exclude field

 DROP TABLE IF EXISTS output_table;
 SELECT {schema_madlib}.cols2vec(
     'golf',               -- source table
     'output_table',       -- output table
     'windy, temperature, humidity',  -- feature list to nest
     'windy',              -- features to exclude, from the above feature list
     '"OUTLOOK", id'       -- columns to keep from source table (as a comma-separated list)
 );

 SELECT * FROM output_table ORDER BY id;
  OUTLOOK  | id | feature_vector
 ----------+----+----------------
  sunny    |  1 | {{85,85}}
  sunny    |  2 | {{80,90}}
  overcast |  3 | {{83,78}}
  rain     |  4 | {{70,96}}
  rain     |  5 | {{68,80}}
  rain     |  6 | {{65,70}}
  overcast |  7 | {{64,65}}
  sunny    |  8 | {{72,95}}
  sunny    |  9 | {{69,70}}
  rain     | 10 | {{75,80}}
  sunny    | 11 | {{75,70}}
  overcast | 12 | {{72,90}}
  overcast | 13 | {{81,75}}
  rain     | 14 | {{71,80}}
 (14 rows)
 """.format(schema_madlib=schema_madlib)

         if not message:
             return summary_string
         elif message.lower() in ('usage', 'help', '?'):
             return usage_string
         elif message.lower() in ('example', 'examples'):
             return example_string
         else:
             return """
 No such option. Use "SELECT {schema_madlib}.cols2vec()" for help.
         """.format(schema_madlib=schema_madlib)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import plpy
	from control import MinWarning
	from internal.db_utils import is_col_1d_array
	from internal.db_utils import quote_literal
	from utilities import _assert
	from utilities import add_postfix
	from utilities import ANY_ARRAY
	from utilities import is_valid_psql_type
	from utilities import py_list_to_sql_string
	from utilities import split_quoted_delimited_str
	from validate_args import is_var_valid
	from validate_args import explicit_bool_to_text
	from validate_args import get_cols
	from validate_args import get_cols_and_types
	from validate_args import get_expr_type
	from validate_args import input_tbl_valid
	from validate_args import output_tbl_valid
	from validate_args import table_exists

	class vec_cols_helper:
	def __init__(self):
	self.all_cols = None

	def get_cols_as_list(self, cols_to_process, source_table=None, exclude_cols=None):
	"""
	Get a list of columns based on the value of cols_to_process
	Args:
	@param cols_to_process: str, Either a * or a comma-separated list of col names
	@param source_table: str, optional. Source table name
	@param exclude_cols: str, optional. Comma-separated list of the col(s) to exclude
	from the source table, only used if cols_to_process is *
	Returns:
	A list of column names (or an empty list)
	"""
	# If cols_to_process is empty/None, return empty list
	if not cols_to_process:
	return []
	if cols_to_process.strip() != "*":
	# If cols_to_process is a comma separated list of names, return list
	# of column names in cols_to_process.
	return [col for col in split_quoted_delimited_str(cols_to_process)
	if col not in split_quoted_delimited_str(exclude_cols)]
	if source_table:
	if not self.all_cols:
	self.all_cols = get_cols(source_table)
	return [col for col in self.all_cols
	if col not in split_quoted_delimited_str(exclude_cols)]
	return []

	class vec2cols:
	def __init__(self):
	self.get_cols_helper = vec_cols_helper()
	self.module_name = self.__class__.__name__

	def validate_args(self, source_table, output_table, vector_col, feature_names,
	cols_to_output):
	"""
	Validate args for vec2cols
	"""
	input_tbl_valid(source_table, self.module_name)
	output_tbl_valid(output_table, self.module_name)
	is_var_valid(source_table, cols_to_output)
	is_var_valid(source_table, vector_col)
	_assert(is_valid_psql_type(get_expr_type(vector_col, source_table), ANY_ARRAY),
	"{0}: vector_col should refer to an array.".format(self.module_name))
	_assert(is_col_1d_array(source_table, vector_col),
	"{0}: vector_col must be a 1-dimensional array.".format(self.module_name))

	def get_names_for_split_output_cols(self, source_table, vector_col):
	"""
	Get list of names for the newly-split columns to include in the
	output table.
	Args:
	@param: source_table, str. Source table
	@param: vector_col, str. Column name containing the array input
	@param: feature_names, list. Python list of the feature names to
	use for the split elements in the vector_col array
	"""
	query = """
	SELECT max(array_upper({0}, 1)) AS n_x
	FROM {1}
	""".format(vector_col, source_table)
	max_array_length = plpy.execute(query)[0]["n_x"]
	if not max_array_length:
	plpy.error('{0}: Column to split ({1}) must not be an empty array'
	.format(self.module_name, vector_col))
	# Create custom col names for output columns, with prefix "f".
	return ["f{0}".format(i+1) for i in range(max_array_length)]

	def validate_output_cols(self, features_to_unnest, cols_to_keep):
	# If there are more than 1600 columns for the output table, we give a
	# warning as it might give an error
	MAX_OUTPUT_COLUMN_COUNT = 1600
	_assert(len(features_to_unnest)+len(cols_to_keep) < MAX_OUTPUT_COLUMN_COUNT,
	"{0}: The output exceeds the max number of columns that " +
	"can be created ({1})".format(self.module_name, MAX_OUTPUT_COLUMN_COUNT))
	# Check if newly created col names have the same name as existing cols
	duplicate_col_names = set(features_to_unnest).intersection(set(cols_to_keep))
	_assert(len(duplicate_col_names) == 0,
	"{0}: Conflicting column names. Column names in source "
	"table cannot be {1}".format(self.module_name,
	list(duplicate_col_names)))

	def vec2cols(self, schema_madlib, source_table, output_table,
	vector_col, feature_names, cols_to_output, **kwargs):
	"""
	Split up a column of array entries into multiple columns, each column
	corresponding to one array position
	Args:
	@param: schema_madlib, str. The schema with madlib installed
	@param: source_table, str. The source table
	@param: output_table, str. The output table
	@param: vector_col, str. The column with array entries to split up
	@param: feature_names, list. Python list of the feature names to use
	for the split elements in the vector_col array
	@param: cols_to_output, str. Comma-separated list of the columns in
	the source_table to include in the output_table
	"""
	self.validate_args(source_table, output_table, vector_col, feature_names,
	cols_to_output)

	# Get names of columns to use for the split vector_col
	if not feature_names:
	features_to_unnest = self.get_names_for_split_output_cols(
	source_table, vector_col)
	else:
	features_to_unnest = feature_names

	# Get the columns to keep from the source table
	cols_to_keep = self.get_cols_helper.get_cols_as_list(cols_to_output,
	source_table)

	self.validate_output_cols(features_to_unnest, cols_to_keep)

	# Construct the output query and populate the output table with all the
	# correct parameters
	select_new_cols = (', '.join(["{0}[{1}] AS {2}".format(vector_col, i + 1, f)
	for i, f in enumerate(features_to_unnest)]))
	cols_from_src_table = ', '.join(cols_to_keep)+', ' if cols_to_keep else ''
	query = """
	CREATE TABLE {output_table} AS
	SELECT {cols_from_src_table} {select_new_cols}
	FROM {source_table}
	""".format(**locals())
	plpy.execute(query)

	def vec2cols_help_message(self, schema_madlib, message, **kwargs):
	"""
	Help message for vec2cols function
	"""
	summary_string = """
	-----------------------------------------------------------------------------------
	SUMMARY
	-----------------------------------------------------------------------------------
	Functionality: Vector to Columns

	The MADlib vec2cols function enables the user to split up a single column into
	multiple columns, given that the input column contains array entries. For example,
	if the input column contained ARRAY[1, 2, 3] in one of its rows, the output table
	will contain 3 different columns, one for each element of the array.

	For more details on function usage:
	SELECT {schema_madlib}.vec2cols('usage');

	For a small example on using the function:
	SELECT {schema_madlib}.vec2cols('example');
	""".format(schema_madlib=schema_madlib)

	usage_string = """
	-----------------------------------------------------------------------------------
	USAGE
	-----------------------------------------------------------------------------------
	SELECT {schema_madlib}.vec2cols(
	'source_table', -- str, Name of the source table that contains the data
	'output_table', -- str, Name of the output view or table
	'vector_col', -- str, Name of the array entry column to be split
	'feature_names', -- array, Optional parameter to provide a text array of
	-- the feature names for the newly split columns (if not
	-- provided, default names f0, f1, ... will be used)
	'cols_to_output' -- str, Optional parameter to specify any other columns
	-- in the source_table to include in the output_table
	-- (default none of them, also supports '*' as input)
	""".format(schema_madlib=schema_madlib)

	example_string = """
	-----------------------------------------------------------------------------------
	EXAMPLE
	-----------------------------------------------------------------------------------
	-- Create an input data set:

	DROP TABLE IF EXISTS golf CASCADE;
	CREATE TABLE golf (
	id integer NOT NULL,
	"OUTLOOK" text,
	temperature double precision,
	humidity double precision,
	"Temp_Humidity" double precision[],
	clouds_airquality text[],
	windy boolean,
	class text,
	observation_weight double precision
	);
	INSERT INTO golf VALUES
	(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
	(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
	(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
	(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
	(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
	(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
	(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
	(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
	(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
	(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
	(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
	(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
	(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
	(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);

	-- Call the vec2cols function on the 'clouds_airquality' column, to split it up

	DROP TABLE IF EXISTS output_table;
	SELECT {schema_madlib}.vec2cols(
	'golf', -- source table
	'output_table', -- output table
	'clouds_airquality', -- column with array entries to split
	ARRAY['a', 'b'], -- feature_names array (will use 'a' to name the first new column, and 'b' for the second)
	'"OUTLOOK", id' -- columns to keep from source table (as a comma-separated list)
	);

	SELECT * FROM output_table ORDER BY id;
	OUTLOOK \| id \| a \| b
	----------+----+--------+-----------
	sunny \| 1 \| none \| unhealthy
	sunny \| 2 \| none \| moderate
	overcast \| 3 \| low \| moderate
	rain \| 4 \| low \| moderate
	rain \| 5 \| medium \| good
	rain \| 6 \| low \| unhealthy
	overcast \| 7 \| medium \| moderate
	sunny \| 8 \| high \| unhealthy
	sunny \| 9 \| high \| good
	rain \| 10 \| medium \| good
	sunny \| 11 \| none \| good
	overcast \| 12 \| medium \| moderate
	overcast \| 13 \| medium \| moderate
	rain \| 14 \| low \| unhealthy
	(14 rows)
	""".format(schema_madlib=schema_madlib)

	if not message:
	return summary_string
	elif message.lower() in ('usage', 'help', '?'):
	return usage_string
	elif message.lower() in ('example', 'examples'):
	return example_string
	else:
	return """
	No such option. Use "SELECT {schema_madlib}.vec2cols()" for help.
	""".format(schema_madlib=schema_madlib)

	class cols2vec:
	def __init__(self):
	self.get_cols_helper = vec_cols_helper()
	self.module_name = self.__class__.__name__

	def validate_args(self, source_table, output_table,
	list_of_features, list_of_features_to_exclude, cols_to_output):
	"""
	Function to validate input parameters
	"""
	input_tbl_valid(source_table, self.module_name)
	output_tbl_valid(output_table, self.module_name)

	_assert(list_of_features and list_of_features.strip(), "{0}: List of "
	"features cannot be empty".format(self.module_name))
	if list_of_features.strip() != '*':
	is_var_valid(source_table, list_of_features)

	if list_of_features_to_exclude:
	if list_of_features_to_exclude.strip() == "*":
	plpy.error("{0}: Cannot exclude all columns from being "
	"features".format(self.module_name))
	elif list_of_features.strip() != '*':
	plpy.info("{0} NOTICE: will exclude given column(s) even though "
	"list of features was not *".format(self.module_name))

	is_var_valid(source_table, list_of_features_to_exclude)
	is_var_valid(source_table, cols_to_output)

	def get_and_validate_feature_types(self, source_table):
	"""
	This function validates the types of all of the features_to_nest,
	disallowing any features of type array. Lets underlying platform
	decide whether to allow 2 elements of different types be put into
	an array together (ex. integer and text features, if put together,
	will yield error by the underlying platform)
	"""
	all_cols_and_types = get_cols_and_types(source_table)
	distinct_types = set([col_type[1] for col_type in all_cols_and_types
	if col_type[0] in self.features_to_nest])
	_assert(not any(is_valid_psql_type(expr_type, ANY_ARRAY)
	for expr_type in distinct_types),
	"{0}: Feature columns to nest cannot be of type array"
	.format(self.module_name))

	if len(distinct_types) != 1 and 'boolean' in distinct_types:
	self.features_to_nest = explicit_bool_to_text(source_table, self.features_to_nest, self.schema_madlib)

	def cols2vec(self, schema_madlib, source_table, output_table, list_of_features,
	list_of_features_to_exclude=None, cols_to_output=None, **kwargs):
	"""
	Args:
	@param schema_madlib: Name of MADlib schema
	@param source_table: Name of table containing input data
	@param output_table: Name of table to output the results
	@param list_of_features: Comma-separated string of column names or
	expressions to put into feature array.
	Can also be a '*' implying all columns
	are to be put into feature array
	@param list_of_features_to_exclude: Comma-separated string of column names
	to exclude from the feature array
	@param cols_to_output: Comma-separated string of column names
	from the source table to keep in the output table,
	in addition to the feature array
	"""
	self.schema_madlib = schema_madlib
	self.validate_args(source_table, output_table, list_of_features,
	list_of_features_to_exclude, cols_to_output)

	self.features_to_nest = self.get_cols_helper.get_cols_as_list(list_of_features,
	source_table, list_of_features_to_exclude)
	self.get_and_validate_feature_types(source_table)

	cols_to_keep = ', '.join(self.get_cols_helper.get_cols_as_list(cols_to_output,
	source_table))+", " if cols_to_output else ''

	feature_list_str = "ARRAY[ {val} ]".format(val=','.join(map(str, self.features_to_nest)))
	plpy.execute("""
	CREATE TABLE {output_table} AS
	SELECT {cols_to_keep}
	{feature_list_str} AS feature_vector
	FROM {source_table}
	""".format(**locals()))

	feature_cols = py_list_to_sql_string(
	[quote_literal(f) for f in self.features_to_nest], "TEXT", True)

	output_table_summary = add_postfix(output_table, "_summary")
	# Dollar-quote the text to allow single-quotes without escaping
	dq = "$__MADLIB_OUTER__$"
	plpy.execute("""
	CREATE TABLE {output_table_summary} AS
	SELECT
	{dq}{source_table}{dq}::TEXT AS source_table,
	{dq}{list_of_features}{dq}::TEXT AS list_of_features,
	{dq}{list_of_features_to_exclude}{dq}::TEXT AS list_of_features_to_exclude,
	{feature_cols} AS feature_names
	""".format(**locals()))

	def cols2vec_help_message(self, schema_madlib, message, **kwargs):
	"""
	Help message for cols2vec function
	"""
	summary_string = """
	-----------------------------------------------------------------------------------
	SUMMARY
	-----------------------------------------------------------------------------------
	Functionality: Columns to Vector

	The MADlib cols2vec function, given a table with a number of feature columns, will
	create an output table that contains the feature columns in an array. A summary table
	will also be created that contains the names of the features combined into array,
	so that this process can be reversed using the function vec2cols.

	For more details on function usage:
	SELECT {schema_madlib}.cols2vec('usage');

	For a small example on using the function:
	SELECT {schema_madlib}.cols2vec('example');
	""".format(schema_madlib=schema_madlib)

	usage_string = """
	-----------------------------------------------------------------------------------
	USAGE
	-----------------------------------------------------------------------------------
	SELECT {schema_madlib}.cols2vec(
	'source_table', -- str, Name of the source table that contains the data
	'output_table', -- str, Name of the output view or table
	'list_of_features', -- Comma-separated string of column names or
	expressions to put into feature array.
	Can also be a '*' implying all columns
	are to be put into feature array
	'list_of_features_to_exclude', -- Comma-separated string of column names
	to exclude from the feature array
	'cols_to_output', -- Comma-separated string of column names
	from the source table to keep in the output table,
	in addition to the feature array
	""".format(schema_madlib=schema_madlib)

	example_string = """
	-----------------------------------------------------------------------------------
	EXAMPLE
	-----------------------------------------------------------------------------------
	-- Create an input data set:

	DROP TABLE IF EXISTS golf CASCADE;
	CREATE TABLE golf (
	id integer NOT NULL,
	"OUTLOOK" text,
	temperature double precision,
	humidity double precision,
	"Temp_Humidity" double precision[],
	clouds_airquality text[],
	windy boolean,
	class text,
	observation_weight double precision
	);
	INSERT INTO golf VALUES
	(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t Play', 5.0),
	(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t Play', 5.0),
	(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 'Play', 1.5),
	(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 1.0),
	(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
	(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0),
	(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
	(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 'Don''t Play', 5.0),
	(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 5.0),
	(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 1.0),
	(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 5.0),
	(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 'Play', 1.5),
	(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 'Play', 1.5),
	(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t Play', 1.0);

	-- Call the cols2vec function on the temperature and humidity columns, to nest them.
	-- Also include "windy" in features list, but exclude this feature using the exclude field

	DROP TABLE IF EXISTS output_table;
	SELECT {schema_madlib}.cols2vec(
	'golf', -- source table
	'output_table', -- output table
	'windy, temperature, humidity', -- feature list to nest
	'windy', -- features to exclude, from the above feature list
	'"OUTLOOK", id' -- columns to keep from source table (as a comma-separated list)
	);

	SELECT * FROM output_table ORDER BY id;
	OUTLOOK \| id \| feature_vector
	----------+----+----------------
	sunny \| 1 \| {{85,85}}
	sunny \| 2 \| {{80,90}}
	overcast \| 3 \| {{83,78}}
	rain \| 4 \| {{70,96}}
	rain \| 5 \| {{68,80}}
	rain \| 6 \| {{65,70}}
	overcast \| 7 \| {{64,65}}
	sunny \| 8 \| {{72,95}}
	sunny \| 9 \| {{69,70}}
	rain \| 10 \| {{75,80}}
	sunny \| 11 \| {{75,70}}
	overcast \| 12 \| {{72,90}}
	overcast \| 13 \| {{81,75}}
	rain \| 14 \| {{71,80}}
	(14 rows)
	""".format(schema_madlib=schema_madlib)

	if not message:
	return summary_string
	elif message.lower() in ('usage', 'help', '?'):
	return usage_string
	elif message.lower() in ('example', 'examples'):
	return example_string
	else:
	return """
	No such option. Use "SELECT {schema_madlib}.cols2vec()" for help.
	""".format(schema_madlib=schema_madlib)