"""
@file pca.py_in
@namespace pca
"""
from convex.utils_regularization import utils_ind_var_scales
from linalg.matrix_ops import get_dims
from linalg.matrix_ops import create_temp_sparse_matrix_table_with_dims
from linalg.matrix_ops import cast_dense_input_table_to_correct_columns
from linalg.matrix_ops import validate_dense
from linalg.matrix_ops import validate_sparse
from linalg.svd import create_summary_table
from linalg.svd import _svd_lower_wrap
from linalg.svd import _svd_upper_wrap
from utilities.utilities import _array_to_string
from utilities.utilities import add_postfix
from utilities.utilities import __mad_version
from utilities.utilities import unique_string, split_quoted_delimited_str
from utilities.utilities import _assert
from utilities.validate_args import get_cols, get_cols_and_types
from utilities.control import MinWarning
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_exists
import time
import plpy
version_wrapper = __mad_version()
string_to_array = version_wrapper.select_vecfunc()
array_to_string = version_wrapper.select_vec_return()
def pca_sparse(schema_madlib, source_table, pc_table, row_id,
col_id, val_id, row_dim, col_dim, k, grouping_cols,
lanczos_iter, use_correlation, result_summary_table,
variance, **kwargs):
"""
Args:
@param schema_madlib
@param source_table
@param pc_table
@param row_id
@param col_id
@param val_id
@param row_dim
@param col_dim
@param k
@param grouping_cols
@param lanczos_iter
@param use_correlation
@param result_summary_table
@param variance
Returns:
None
"""
pca_wrap(schema_madlib, source_table, pc_table, row_id,
k, grouping_cols, lanczos_iter, use_correlation,
result_summary_table, variance, True, col_id,
val_id, row_dim, col_dim)
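# A minimal illustrative call from SQL (hypothetical schema and table
# names), matching the usage documented in pca_sparse_help_message():
#   SELECT madlib.pca_sparse_train('mat_sparse', 'result_table', 'row_id',
#                                  'col_id', 'value', 1000, 300, 10);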
# ------------------------------------------------------------------------
def pca(schema_madlib, source_table, pc_table, row_id,
k, grouping_cols, lanczos_iter, use_correlation,
result_summary_table, variance, **kwargs):
"""
Args:
@param schema_madlib
@param source_table
@param pc_table
@param row_id
@param k
@param grouping_cols
@param lanczos_iter
@param use_correlation
@param result_summary_table
@param variance
Returns:
None
"""
pca_wrap(schema_madlib, source_table, pc_table, row_id,
k, grouping_cols, lanczos_iter, use_correlation,
result_summary_table, variance)
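# A minimal illustrative call from SQL (hypothetical schema and table
# names), matching the usage documented in pca_help_message():
#   SELECT madlib.pca_train('mat', 'result_table', 'id', 10);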
# ------------------------------------------------------------------------
def pca_wrap(schema_madlib, source_table, pc_table, row_id,
k, grouping_cols, lanczos_iter, use_correlation,
result_summary_table, variance, is_sparse=False, col_id=None,
val_id=None, row_dim=None, col_dim=None, **kwargs):
"""
This wrapper was added to support grouping columns. This
function does the necessary pre-processing for handling
grouping_cols, if set. It then constructs a single query
that includes a separate "madlib._pca_union(...)" for each
group.
"""
# Reset the message level to avoid random messages
with MinWarning('warning'):
grouping_cols_list = []
if is_sparse:
_validate_args(schema_madlib, source_table, pc_table, k, row_id, col_id,
val_id, row_dim, col_dim, lanczos_iter,
use_correlation, result_summary_table, variance)
else:
_validate_args(schema_madlib, source_table, pc_table, k,
row_id, None, None, None, None,
lanczos_iter, use_correlation,
result_summary_table, variance)
if grouping_cols:
# Validate the grouping columns. We currently only support grouping_cols
# that are column names in source_table, not expressions.
grouping_cols_list = split_quoted_delimited_str(grouping_cols)
_assert(columns_exist_in_table(source_table, grouping_cols_list, schema_madlib),
"PCA error: One or more grouping columns in {0} do not exist!".
format(grouping_cols))
distinct_grouping_values = plpy.execute(
"SELECT DISTINCT {grouping_cols} FROM {source_table}".
format(grouping_cols=grouping_cols, source_table=source_table))
else:
grouping_cols = ''
other_columns_in_table = [col for col in get_cols(source_table)
if col not in grouping_cols_list]
grouping_cols_clause = ''
if grouping_cols_list:
cols_names_types = get_cols_and_types(source_table)
grouping_cols_clause = ', ' + ', '.join(
[c_name + " " + c_type for (c_name, c_type) in cols_names_types
if c_name in grouping_cols_list])
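# For example, hypothetical grouping columns "region, dept" of type text
# produce grouping_cols_clause = ", region text, dept text".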
# Create all output tables
plpy.execute("""
CREATE TABLE {pc_table} (
row_id INTEGER,
principal_components double precision[],
std_dev double precision,
proportion double precision
{grouping_cols_clause}
)
""".format(pc_table=pc_table,
grouping_cols_clause=grouping_cols_clause))
pc_table_mean = add_postfix(pc_table, "_mean")
plpy.execute("DROP TABLE IF EXISTS {0}".format(pc_table_mean))
plpy.execute("""
CREATE TABLE {pc_table_mean} (
column_mean double precision[]
{grouping_cols_clause}
)
""".format(pc_table_mean=pc_table_mean, grouping_cols_clause=grouping_cols_clause))
if result_summary_table:
plpy.execute("DROP TABLE IF EXISTS {0}".format(result_summary_table))
plpy.execute("""
CREATE TABLE {0} (
rows_used INTEGER,
"exec_time (ms)" numeric,
iter INTEGER,
recon_error double precision,
relative_recon_error double precision,
use_correlation boolean
{1}
)
""".format(result_summary_table, grouping_cols_clause))
else:
result_summary_table = ''
# declare variables whose values will be different for each group, if
# grouping_cols is specified
grouping_where_clause = ''
sparse_where_condition = ''
select_grouping_cols = ''
temp_table_columns = ''
result_summary_table_temp = ''
# For dense matrix format only:
# We can now ignore the original row_id for all computations, since we
# create a new table whose row_id column has no duplicates and ranges
# from 1 to the number of rows in the group/table. This mainly supports
# the grouping scenario, where each group's row_id values might not
# range from 1 to the number of rows in that group; the same behavior
# simply extends to the non-grouping scenario as well. If correcting
# the row_id column is unimportant in non-grouping cases, we could skip
# creating the temp table and save some computation time, but at the
# moment the code creates it even in the non-grouping scenario.
# We don't need to do this for the sparse representation because of the
# nature of its definition.
other_columns_in_table.remove(row_id)
temp_table_columns = (" ROW_NUMBER() OVER() AS row_id, " +
','.join(other_columns_in_table))
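# For example, with hypothetical source columns (id, row_vec) and
# row_id='id', temp_table_columns becomes
# " ROW_NUMBER() OVER() AS row_id, row_vec".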
pca_union_call_list = []
grp_id = 0
if not is_sparse:
col_id = 'NULL'
val_id = 'NULL'
row_dim = 0
col_dim = 0
while True:
if result_summary_table:
result_summary_table_temp = ("pg_temp.{0}_{1}".
format(unique_string(), grp_id))
if grouping_cols:
grp_value_dict = distinct_grouping_values[grp_id]
where_conditions = ' AND '.join(
["{0} = '{1}'".format(key, value)
for key, value in grp_value_dict.items()])
sparse_where_condition = ' AND ' + where_conditions
grouping_where_clause = ' WHERE ' + where_conditions
select_grouping_cols = (
', ' + ', '.join(["'{1}' AS {0}".format(key, value)
for key, value in grp_value_dict.items()]))
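# For example (hypothetical group), grp_value_dict = {'region': 'east'}
# yields grouping_where_clause = " WHERE region = 'east'" and
# select_grouping_cols = ", 'east' AS region".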
pca_union_call_list.append("""
{schema_madlib}._pca_union(
'{source_table}',
'{pc_table}',
'{pc_table_mean}',
'{row_id}',
{k_null},
'{grouping_cols}',
{lanczos_iter},
{use_correlation},
'{result_summary_table}',
'{result_summary_table_temp}',
{variance_null},
{grp_id},
$${grouping_where_clause}$$,
$${sparse_where_condition}$$,
$${select_grouping_cols}$$,
'{temp_table_columns}',
{is_sparse},
'{col_id}',
'{val_id}',
{row_dim},
{col_dim})
""".format(k_null='NULL' if k is None else k,
variance_null='NULL' if variance is None else variance,
**locals()))
grp_id += 1
if not grouping_cols_list or len(distinct_grouping_values) == grp_id:
break
# "SELECT <query_1>, <query_2>, <query_3>, ..." is expected to run each
# <query_i> in parallel.
pca_union_call = 'SELECT ' + ', '.join(pca_union_call_list)
plpy.execute(pca_union_call)
def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
row_id, k, grouping_cols, lanczos_iter, use_correlation,
result_summary_table, result_summary_table_temp, variance,
grp_id, grouping_where_clause, sparse_where_condition,
select_grouping_cols, temp_table_columns, is_sparse, col_id,
val_id, row_dim, col_dim, **kwargs):
"""
This function does all the heavy lifting of PCA, for both pca and
pca_sparse. It computes the PCA of the matrix in source_table: sparse
input is densified first, the centered matrix is factorized via SVD
(creating tables for the three decomposition matrices), and the results
for a single group are inserted into the output tables.
Args:
@param source_table TEXT, -- Source table name (dense matrix)
@param pc_table TEXT, -- Output table name for the principal components
@param pc_table_mean TEXT, -- Output table name for the column means
@param row_id TEXT, -- Column name for the ID for each row
@param k INTEGER, -- Number of principal components to compute
@param grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL)
@param lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest matrix dimension))
@param use_correlation BOOLEAN, -- If True, the correlation matrix is used for principal components (Default: False)
@param result_summary_table TEXT, -- Table name to store the summary of results (Default: NULL)
@param result_summary_table_temp TEXT, -- Temporary table name for the per-group summary of results (Default: NULL)
@param variance DOUBLE PRECISION, -- The target proportion of variance (Default: NULL)
@param grp_id INTEGER, -- A placeholder ID for each group
@param grouping_where_clause TEXT, -- WHERE clause using grouping_cols
@param sparse_where_condition TEXT, -- WHERE condition using grouping_cols for sparse input
@param select_grouping_cols TEXT, -- SELECT clause using grouping_cols
@param temp_table_columns TEXT, -- SELECT clause for creating a temporary copy of the source_table
@param is_sparse BOOLEAN, -- Specifies if the PCA call is for sparse or dense matrices
@param col_id TEXT, -- Column name for the column coordinates (sparse only)
@param val_id TEXT, -- Column name for the sparse values (sparse only)
@param row_dim INTEGER, -- Number of rows in the sparse matrix
@param col_dim INTEGER -- Number of columns in the sparse matrix
Returns:
None
"""
startTime = time.time() # measure the starting time
# Step 1: Modify data format for sparse input
if is_sparse:
# Step 1.1: Densify the matrix for sparse input tables
# We densify the matrix because the recentering process will generate a
# dense matrix, so we just wrap around regular PCA.
# First we must copy the sparse matrix and add in the dimension information
sparse_temp = "pg_temp." + unique_string() + "_sparse"
# Add in the dimension information needed by the densifying process
create_temp_sparse_matrix_table_with_dims(source_table, sparse_temp,
row_id, col_id, val_id,
row_dim, col_dim, sparse_where_condition)
validate_sparse(sparse_temp,
{'row': row_id, 'col': col_id, 'val': val_id},
check_col=False)
# Step 1.2: Densify the input matrix
x_dense = "pg_temp." + unique_string() + "_dense"
plpy.execute("""
SELECT {schema_madlib}.matrix_densify(
'{sparse_temp}',
'row={row_id}, col={col_id}, val={val_id}',
'{x_dense}', 'row=row_id, val=row_vec')
""".format(schema_madlib=schema_madlib,
sparse_temp=sparse_temp, row_id=row_id,
col_id=col_id, val_id=val_id, x_dense=x_dense))
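# For example (hypothetical entries), a 2x3 sparse matrix containing the
# single triple (row=1, col=2, val=3.0) densifies to two array rows:
# row_id=1, row_vec=[0, 3.0, 0] and row_id=2, row_vec=[0, 0, 0].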
plpy.execute("""
DROP TABLE IF EXISTS {0};
""".format(sparse_temp))
source_table_grouped = x_dense
else:
# Creating this temp table is unnecessary when there is no grouping and
# the input table already has well-formed row_id values. It ensures PCA
# works even when the row_id column of the dense matrix format does not
# range from 1 to the number of rows.
source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
plpy.execute("""
CREATE TABLE {source_table_grouped} AS
SELECT {temp_table_columns}
FROM {source_table}
{grouping_where_clause}
""".format(source_table_grouped=source_table_grouped,
source_table=source_table,
grouping_where_clause=grouping_where_clause,
temp_table_columns=temp_table_columns))
row_id = 'row_id'
# Make sure that the table has row_id and row_vec
source_table_copy = "pg_temp." + unique_string() + "_reformated_names"
created_new_table = cast_dense_input_table_to_correct_columns(
schema_madlib, source_table_grouped, source_table_copy, row_id)
if created_new_table:
plpy.execute("DROP TABLE {0}".format(source_table_grouped))
source_table_grouped = source_table_copy
[row_dim, col_dim] = get_dims(source_table_grouped,
{'row': 'row_id', 'col': 'col_id',
'val': 'row_vec'})
validate_dense(source_table_grouped,
{'row': 'row_id', 'val': 'row_vec'},
check_col=False, row_dim=row_dim)
if k:
if k > min(row_dim, col_dim):
plpy.error(
"PCA error: k cannot be larger than min(row_dim, col_dim)!")
curK = k
else:
curK = min(row_dim, col_dim)
# If lanczos_iter is 0, compute the default number of Lanczos iterations
if lanczos_iter == 0:
if k:
lanczos_iter = min(k + 40, min(col_dim, row_dim))
else:
lanczos_iter = min(col_dim, row_dim)
else:
if variance: # lanczos_iter overrides the proportion default for k
curK = lanczos_iter
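# Worked example for the default case: with k=10 on a 1000x300 matrix
# and lanczos_iter=0, the default resolves to
# min(10 + 40, min(300, 1000)) = 50 iterations.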
# Note: we currently don't support correlation matrices
# (use_correlation is validated to be False)
if not use_correlation:
# Step 2: Normalize the data (Column means)
dimension = col_dim
scaled_source_table = "pg_temp." + unique_string() + "_scaled_table"
column_mean_str = _recenter_data(schema_madlib,
source_table_grouped,
scaled_source_table,
'row_id',
'row_vec',
dimension)
# Step 3: Create temporary output & result summary table
svd_output_temp_table = "pg_temp." + unique_string() + "_svd_out_tbl"
# Step 4: Perform SVD
# Step 4.1: Perform upper part of SVD
if result_summary_table_temp:
t0 = time.time()
(source_table_svd, bd_pref) = _svd_upper_wrap(
schema_madlib, scaled_source_table, svd_output_temp_table,
row_id, curK, lanczos_iter, result_summary_table_temp)
# Calculate the total sum of squares, used for variance proportions
svd_var_s = add_postfix(svd_output_temp_table, "_s")
eigen_sum = plpy.execute(
"""
SELECT {schema_madlib}.array_sum(
{schema_madlib}.sum(
{schema_madlib}.array_square(row_vec)
)
)
FROM {scaled_source_table}
""".format(**locals()))[0]['array_sum']
# Step 4.2: Adjust the k value
if variance:
variance_tmp_table = "pg_temp." + unique_string() + "_var_tmp"
plpy.execute(
"""
CREATE TABLE {variance_tmp_table} AS (
SELECT row_id,
col_id,
sum(value*value) OVER(ORDER BY value DESC)
AS sum_value_square
FROM {svd_var_s})
""".format(variance_tmp_table=variance_tmp_table,
svd_var_s=svd_var_s))
ecount = plpy.execute("""
SELECT count(sum_value_square)
FROM {variance_tmp_table}
WHERE sum_value_square / {eigen_sum} < {variance}
""".format(variance_tmp_table=variance_tmp_table,
eigen_sum=eigen_sum,
variance=variance))
curK = ecount[0]['count']
# The next value is the first that crosses the threshold
if lanczos_iter > curK:
curK = curK + 1
plpy.execute("""
DROP TABLE IF EXISTS {variance_tmp_table}
""".format(variance_tmp_table=variance_tmp_table))
# Step 4.3: Perform the lower part of SVD
tmp_matrix_table = "temp_" + unique_string() + "_matrix"
tmp_matrix_s_table = add_postfix(tmp_matrix_table, "_s")
if variance: # remove elements after the variance threshold
plpy.execute("""
DELETE FROM {svd_var_s}
WHERE row_id >{curK};
ALTER TABLE {svd_var_s} RENAME TO {tmp_matrix_s_table};
""".format(**locals()))
_svd_lower_wrap(schema_madlib, source_table_svd,
svd_output_temp_table, row_id, curK, lanczos_iter, bd_pref,
tmp_matrix_s_table)
else:
tmp_matrix_table = svd_output_temp_table
_svd_lower_wrap(schema_madlib, source_table_svd,
svd_output_temp_table, row_id, curK, lanczos_iter, bd_pref)
# Step 4.4: Create the SVD result table
if result_summary_table_temp:
t1 = time.time()
[row_dim, col_dim] = get_dims(source_table_grouped,
{'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'})
arguments = {'schema_madlib': schema_madlib,
'source_table': scaled_source_table,
'matrix_u': add_postfix(svd_output_temp_table, "_u"),
'matrix_v': add_postfix(svd_output_temp_table, "_v"),
'matrix_s': add_postfix(tmp_matrix_table, "_s"),
'row_dim': row_dim,
'col_dim': col_dim,
'result_summary_table': result_summary_table_temp,
'temp_prefix': "pg_temp." + unique_string(),
't0': t0, 't1': t1}
create_summary_table(**arguments)
# Step 5: Transpose SVD output matrix
svd_v_transpose = "pg_temp." + unique_string() + "transpose"
svd_output_temp_table_s = add_postfix(tmp_matrix_table, "_s")
svd_output_temp_table_v = add_postfix(svd_output_temp_table, "_v")
svd_output_temp_table_u = add_postfix(svd_output_temp_table, "_u")
plpy.execute(
"""
SELECT {schema_madlib}.matrix_trans(
'{svd_output_temp_table_v}',
'row=row_id, col=col_id, val=row_vec',
'{svd_v_transpose}', 'row=row_id, col=col_id, val=row_vec');
""".format(svd_output_temp_table_v=svd_output_temp_table_v,
svd_v_transpose=svd_v_transpose,
schema_madlib=schema_madlib)
)
# Step 6: Insert the output of SVD into the PCA table
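# Each principal component's std_dev is s_i / sqrt(n - 1), the sample
# standard deviation of the data projected onto that component; its
# proportion is s_i^2 / eigen_sum, the fraction of variance explained.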
plpy.execute(
"""
INSERT INTO {pc_table}
SELECT {svd_v_transpose}.row_id,
row_vec AS principal_components,
value / sqrt({row_dim} - 1) AS std_dev,
((value*value)/ {eigen_sum}) AS proportion
{select_grouping_cols}
FROM {svd_v_transpose},
{svd_output_temp_table_s}
WHERE ({svd_v_transpose}.row_id = {svd_output_temp_table_s}.row_id)
AND ({svd_v_transpose}.row_id <= {k})
AND value is not NULL
""".format(svd_output_temp_table_s=svd_output_temp_table_s,
k=curK,
svd_v_transpose=svd_v_transpose,
pc_table=pc_table,
row_dim=row_dim,
eigen_sum=eigen_sum,
select_grouping_cols=select_grouping_cols))
# Output the column mean
plpy.execute(
"""
INSERT INTO {pc_table_mean}
SELECT '{column_mean_str}'::FLOAT8[] AS column_mean
{select_grouping_cols}
""".format(pc_table_mean=pc_table_mean,
column_mean_str=column_mean_str,
select_grouping_cols=select_grouping_cols))
# Step 7: Append to the SVD summary table to get the PCA summary table
if result_summary_table_temp:
stopTime = time.time()
dt = (stopTime - startTime) * 1000.
plpy.execute(
"""
INSERT INTO {result_summary_table}
SELECT
rows_used,
{dt} AS "exec_time (ms)",
{iter} AS iter,
recon_error,
relative_recon_error,
{use_correlation} AS use_correlation
{select_grouping_cols}
FROM {result_summary_table_temp};
""".format(result_summary_table=result_summary_table,
dt=str(dt), iter=curK,
use_correlation=bool(use_correlation),
result_summary_table_temp=result_summary_table_temp,
select_grouping_cols=select_grouping_cols))
plpy.execute("DROP TABLE {result_summary_table_temp};".format(
result_summary_table_temp=result_summary_table_temp))
# Step 8: Output handling & cleanup
plpy.execute(
"""
DROP TABLE IF EXISTS {tmp_matrix_s_table};
DROP TABLE IF EXISTS {svd_v_transpose};
DROP TABLE IF EXISTS {source_table_copy};
DROP TABLE IF EXISTS {svd_output_temp_table};
DROP TABLE IF EXISTS {svd_output_temp_table_s};
DROP TABLE IF EXISTS {svd_output_temp_table_u};
DROP TABLE IF EXISTS {svd_output_temp_table_v};
DROP TABLE IF EXISTS {scaled_source_table};
DROP TABLE IF EXISTS {source_table_grouped};
""".format(svd_output_temp_table=svd_output_temp_table,
svd_output_temp_table_s=svd_output_temp_table_s,
svd_output_temp_table_u=svd_output_temp_table_u,
svd_output_temp_table_v=svd_output_temp_table_v,
scaled_source_table=scaled_source_table,
svd_v_transpose=svd_v_transpose,
source_table_copy=source_table_copy,
tmp_matrix_s_table=tmp_matrix_s_table,
source_table_grouped=source_table_grouped))
# ------------------------------------------------------------------------
# ------------------------------------------------------------------------
# Validate arguments: Same as pca
# ------------------------------------------------------------------------
def _validate_args(schema_madlib,
source_table,
pc_table,
k,
row_id,
col_id=None,
val_id=None,
row_dim=None,
col_dim=None,
lanczos_iter=0,
use_correlation=False,
result_summary_table=None,
variance=None):
"""
Validates all arguments passed to the PCA function
Args:
@param schema_madlib Name of the schema where MADlib is installed
@param source_table Name of the source table
@param pc_table Name of the output table
@param k Number of principal components to return
@param row_id Name of the row_id column
@param col_id Name of the col_id column (sparse input only)
@param val_id Name of the val_id column (sparse input only)
@param row_dim Number of rows in the sparse matrix
@param col_dim Number of columns in the sparse matrix
@param lanczos_iter The number of Lanczos iterations to use in the SVD calculation
@param use_correlation Whether the correlation matrix should be used instead of the covariance matrix
@param result_summary_table Name of the summary table
@param variance Target proportion of variance
Returns:
None
Throws:
plpy.error if any argument is invalid
"""
_assert(source_table is not None and table_exists(source_table),
"PCA error: Source data table {0} does not exist!".
format(str(source_table)))
components_err_msg = ("PCA error: components_param must be either "
"a positive integer or a float in the range (0.0, 1.0]!")
if not k and not variance:
plpy.error(components_err_msg)
if k and k <= 0:
plpy.error(components_err_msg)
if variance and (variance <= 0 or variance > 1):
plpy.error(components_err_msg)
# confirm output tables are valid
if pc_table:
_assert(not table_exists(pc_table, only_first_schema=True) and
not table_exists(pc_table + '_mean', only_first_schema=True),
"PCA error: Output table {pc_table}/{pc_table}_mean "
"already exist!".format(pc_table=pc_table))
else:
plpy.error("PCA error: Invalid output table prefix!")
_assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
"PCA error: {1} column does not exist in {0}!".
format(source_table, "NULL" if row_id is None else row_id))
if lanczos_iter < 0:
plpy.error("PCA error: lanczos_iter can't be negative! "
"(Use zero for the default value.) "
"The provided value is {0}".format(lanczos_iter))
# If using sparse matrices, check that the parameters are reasonable
if col_id or val_id:
if not col_id:
plpy.error("PCA error: Column ID should be provided if "
"value ID is input!")
if not val_id:
plpy.error("PCA error: Value ID should be provided if "
"column ID is input!")
_assert(columns_exist_in_table(source_table, [col_id], schema_madlib),
"PCA error: {1} column does not exist in {0}!".
format(source_table, col_id))
_assert(columns_exist_in_table(source_table, [val_id], schema_madlib),
"PCA error: {1} column does not exist in {0}!".
format(source_table, val_id))
if not col_dim:
plpy.error("PCA error: Column dimensions should be provided if "
"using sparse matrices!")
if not row_dim:
plpy.error("PCA error: Row dimensions should be provided if "
"using sparse matrices!")
if row_dim <= 0:
plpy.error("PCA error: The row dimension must be larger than 0!")
if col_dim <= 0:
plpy.error("PCA error: The column dimension must be larger than 0!")
if use_correlation:
plpy.error("PCA error: Using the correlation matrix is not enabled! "
"This value must be set to FALSE.")
if result_summary_table:
if not result_summary_table.strip():
plpy.error("PCA error: Invalid result summary table name!")
_assert(not table_exists(result_summary_table, only_first_schema=True),
"PCA error: Result summary table {0} already exists!".
format(result_summary_table))
# ========================================================================
def _recenter_data(schema_madlib, source_table, output_table, row_id,
col_name, dimension):
"""
Recenters the data table by subtracting the column means.
The output is stored in output_table. The input table should have an
array column that contains the data.
Args:
@param schema_madlib Name of the schema where MADlib is installed
@param source_table Name of the source table
@param output_table Name of the output table
@param row_id Name of the row_id column
@param col_name Name of the array column from input table
@param dimension Number of columns in the data arrays
Returns:
Column mean
"""
# Step 1: Compute column mean values
x_scales = utils_ind_var_scales(tbl_data=source_table,
col_ind_var=col_name,
dimension=dimension,
schema_madlib=schema_madlib)
x_mean_str = _array_to_string(x_scales["mean"])
x_std_str = _array_to_string([1] * dimension)
# Step 2: Rescale the matrices
plpy.execute(
"""
CREATE TABLE {output_table}
m4_ifdef(`__POSTGRESQL__', `', `WITH (appendonly=true)')
AS
SELECT
{row_id} as row_id,
({schema_madlib}.utils_normalize_data(
{col_name},
'{x_mean_str}'::double precision[],
'{x_std_str}'::double precision[]))
AS row_vec
FROM {source_table}
""".format(schema_madlib=schema_madlib,
col_name=col_name,
row_id=row_id,
source_table=source_table,
output_table=output_table,
x_mean_str=x_mean_str,
x_std_str=x_std_str))
return x_mean_str
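# Worked example (hypothetical data): for rows [1, 2] and [3, 6], the
# column means are [2, 4], so the recentered rows written to
# output_table are [-1, -2] and [1, 2], and the returned string encodes
# the mean vector [2, 4].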
# ------------------------------------------------------------------------
# Sparse PCA train help function
def pca_sparse_help_message(schema_madlib, message=None, **kwargs):
"""
Given a help string, provide usage information
Args:
@param schema_madlib Name of the MADlib schema
@param message Helper message to print
Returns:
None
"""
if message is not None and \
message.lower() in ("usage", "help", "?"):
return """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.pca_sparse_train(
source_table -- TEXT, Name of data table
pc_table -- TEXT, Name of the table containing the principal components
row_id -- TEXT, Column name for the row coordinates.
col_id -- TEXT, Column name for the column coordinates.
val_id -- TEXT, Column name for the sparse values.
row_dim -- INTEGER, The number of rows in the sparse matrix
col_dim -- INTEGER, The number of columns in the sparse matrix
components_param -- INTEGER OR FLOAT, The parameter to control the number of
principal components to calculate from the input data.
grouping_cols -- TEXT, Comma-separated list of grouping columns
(Default: NULL)
lanczos_iter -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
(Default: minimum of the smallest input
matrix dimension and k+40)
use_correlation -- BOOLEAN, If True correlation matrix is used for principal components
(Default: False)
rslt_summary_table -- TEXT, Table name to store summary of results
(Default: NULL)
);
If components_param is an INTEGER, it denotes the number of principal
components to compute. If components_param is a FLOAT, it is used as the
target proportion of variance.
-------------------------------------------------------------------------
OUTPUT TABLES
-------------------------------------------------------------------------
A PCA model is created for each group, if grouping_cols is specified.
The output table ("pc_table" above) has the following columns:
row_id -- INTEGER, The ranking of the eigenvalues
principal_components -- FLOAT[], The principal components
std_dev -- FLOAT, The standard deviation associated with each
principal component
proportion -- FLOAT, The proportion of variance explained by each
principal component
grouping_cols -- The grouping columns (with their types), if any,
specified in grouping_cols
A secondary output table named "pc_table"_mean is also generated.
This table has the following columns:
column_mean -- FLOAT[], The column means of the input data
grouping_cols -- The grouping columns (with their types), if any,
specified in grouping_cols
-------------------------------------------------------------------------
RESULT SUMMARY TABLE
-------------------------------------------------------------------------
The result summary table ("rslt_summary_table" above) has the following columns:
rows_used -- INTEGER, Number of rows used in the PCA calculation
exec_time -- FLOAT, Number of milliseconds the PCA calculation took
use_correlation -- BOOLEAN, Value of parameter use_correlation
iter -- INTEGER, Number of iterations the SVD took to converge
recon_error -- FLOAT, Absolute error in the approximation
relative_recon_error -- FLOAT, Relative error in the approximation
grouping_cols -- The grouping columns (with their types), if any,
specified in grouping_cols
""".format(schema_madlib=schema_madlib)
else:
return """
----------------------------------------------------------------
Summary: Sparse PCA Training
----------------------------------------------------------------
Principal component analysis (PCA) is a mathematical procedure that uses an
orthogonal transformation to convert a set of observations of possibly
correlated variables into a set of values of linearly uncorrelated variables
called principal components. This transformation is defined in such a way that
the first principal component has the largest possible variance (i.e.,
accounts for as much of the variability in the data as possible), and each
succeeding component in turn has the highest variance possible under the
constraint that it be orthogonal to (i.e., uncorrelated with) the preceding
components.
--
For an overview on usage, run:
SELECT {schema_madlib}.pca_sparse_train('usage');
--
""".format(schema_madlib=schema_madlib)
def pca_help_message(schema_madlib, message=None, **kwargs):
"""
Given a help string, provide usage information
Args:
@param schema_madlib Name of the MADlib schema
@param message Helper message to print
Returns:
None
"""
if message is not None and \
message.lower() in ("usage", "help", "?"):
return """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.pca_train(
source_table -- TEXT, Name of data table
pc_table -- TEXT, Name of the table containing the principal components
row_id -- TEXT, Column name for the row coordinates.
components_param -- INTEGER OR FLOAT, The parameter to control the number of
principal components to calculate from
the input data.
grouping_cols -- TEXT, Comma-separated list of grouping column names
(Default: NULL)
lanczos_iter -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
(Default: minimum of the smallest input
matrix dimension and k+40)
use_correlation -- BOOLEAN, If True correlation matrix is used for principal components
(Default: False)
rslt_summary_table -- TEXT, Table name to store summary of results
(Default: NULL)
variance -- DOUBLE PRECISION, Proportion of variance
(Default: NULL)
);
If components_param is an INTEGER, it denotes the number of principal
components to compute. If components_param is a FLOAT, it is used as the
target proportion of variance.
-------------------------------------------------------------------------
OUTPUT TABLES
-------------------------------------------------------------------------
A PCA model is created for each group, if grouping_cols is specified.
The output table ("pc_table" above) has the following columns:
row_id -- INTEGER, The ranking of the eigenvalues
principal_components -- FLOAT[], The principal components
std_dev -- FLOAT, The standard deviation associated with each
principal component
proportion -- FLOAT, The proportion of variance explained by each
principal component
grouping_cols -- The grouping columns (with their types), if any,
specified in grouping_cols
A secondary output table named "pc_table"_mean is also generated.
This table has the following columns:
column_mean -- FLOAT[], The column means of the input data
grouping_cols -- The grouping columns (with their types), if any,
specified in grouping_cols
-------------------------------------------------------------------------
RESULT SUMMARY TABLE
-------------------------------------------------------------------------
The result summary table ("rslt_summary_table" above) has the following columns:
rows_used -- INTEGER, Number of rows used in the PCA calculation
exec_time -- FLOAT, Number of milliseconds the PCA calculation took
use_correlation -- BOOLEAN, Value of parameter use_correlation
iter -- INTEGER, Number of iterations the SVD took to converge
recon_error -- FLOAT, Absolute error in the approximation
relative_recon_error -- FLOAT, Relative error in the approximation
grouping_cols -- The grouping columns (with their types), if any,
specified in grouping_cols
""".format(schema_madlib=schema_madlib)
else:
return """
----------------------------------------------------------------
Summary: PCA Training
----------------------------------------------------------------
Principal component analysis (PCA) is a mathematical procedure that uses an
orthogonal transformation to convert a set of observations of possibly
correlated variables into a set of values of linearly uncorrelated variables
called principal components. This transformation is defined in such a way that
the first principal component has the largest possible variance (i.e.,
accounts for as much of the variability in the data as possible), and each
succeeding component in turn has the highest variance possible under the
constraint that it be orthogonal to (i.e., uncorrelated with) the preceding
components.
--
For an overview on usage, run:
SELECT {schema_madlib}.pca_train('usage');
--
""".format(schema_madlib=schema_madlib)