src/ports/postgres/modules/pca/pca_project.py_in - madlib - Git at Google

 """
 @file pca_project.py_in

 @namespace pca
 """

 import plpy
 import time

 from linalg.matrix_ops import cast_dense_input_table_to_correct_columns
 from linalg.matrix_ops import create_temp_sparse_matrix_table_with_dims
 from linalg.matrix_ops import get_dims
 from linalg.matrix_ops import validate_dense
 from linalg.matrix_ops import validate_sparse
 from utilities.utilities import __mad_version
 from utilities.utilities import unique_string
 from utilities.utilities import _assert
 from utilities.utilities import _array_to_string
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_exists
 from utilities.utilities import add_postfix


 version_wrapper = __mad_version()
 string_to_array = version_wrapper.select_vecfunc()
 array_to_string = version_wrapper.select_vec_return()
 ZERO_THRESHOLD = 1e-6


 # Dense PCA help function
 def pca_project_help(schema_madlib, usage_string=None, **kwargs):
     """
     Given a usage string, give out function usage information.
     """
     if usage_string is None:
         usage_string = ''

     if (usage_string.lower() == "usage"):
         return """
         ----------------------------------------------------------------
                                 Usage
         ----------------------------------------------------------------
         SELECT {schema_madlib}.pca_project (
             'tbl_source',          -- Data table
             'tbl_pc',              -- Table with principal componenents
                                         (obtained as output from pca_train)
             'tbl_result',          -- Result table
             'row_id',              -- Name of the column containing the row_id
             -- Optional Parameters
             ----------------------------------------------------------------
             'tbl_residual',        -- Residual table (Default: NULL)
             'tbl_result_summary',  -- Result summary table (Default : NULL)
         );

                                 Output Tables
         --------------------------------------------------------------------
         The output is divided into three tables (two of which are optional)

         --------------------------------------------------------------------
          The output table ('tbl_result' above) encodes a dense matrix
          with the projection onto the principal components. The matrix contains
          the following columns:

          'row_id'        INTEGER,            -- Row id of the output matrix
          'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix

         --------------------------------------------------------------------
          The residual table ('tbl_residual' above) encodes a dense residual
          matrix which has the following columns

          'row_id'        INTEGER,            -- Row id of the output matrix
          'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix

         --------------------------------------------------------------------
          The result summary table ('tbl_result_summary' above) has the following columns

           'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
           'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
           'relative_residual_norm'   DOUBLE PRECISION    -- Relative error of the residuals
         ----------------------------------------------------------------
         """.format(schema_madlib=schema_madlib)
     else:
         return """
         ----------------------------------------------------------------
                  Summary: PCA Projection
         ----------------------------------------------------------------
         PCA Projection: Projects a dataset to an already trained
         space of principal components.
         --
         For function usage information, run
         SELECT {schema_madlib}.pca_project('usage');
         --
         """.format(schema_madlib=schema_madlib)


 # Sparse PCA help function
 # ------------------------------------------------------------------------
 def pca_sparse_project_help(schema_madlib, usage_string=None, **kwargs):
     """
     Given a usage string, give out function usage information.
     """
     if usage_string is None:
         usage_string = ''

     if (usage_string.lower() == "usage"):
         return """
         ----------------------------------------------------------------
                                 Usage
         ----------------------------------------------------------------
         SELECT {schema_madlib}.pca_sparse_project (
             'tbl_source',          -- Data table
             'tbl_pc',              -- Table with principal componenents
                                         (obtained as output from pca_train)
             'tbl_result',          -- Result table
             'row_id',              -- Name of the column containing the row_id
             'col_id',              -- Name of the column containing the col_id
             'val_id',              -- Name of the column containing the val_id
             'row_dim'              -- Row dimension of the sparse matrix
             'col_dim'              -- Column dimension of the sparse matrix
             -- Optional Parameters
             ----------------------------------------------------------------
             'tbl_residual',        -- Residual table (Default: NULL)
             'tbl_result_summary',  -- Result summary table (Default : NULL)
         );

                                 Output Tables
         ----------------------------------------------------------------
         The output is divided into three tables (two of which are optional)

         -----------------------------------------------------------------------------------------
          The output table ('tbl_result' above) encodes a dense matrix
          with the projection onto the principal components. The matrix contains
          the following columns:

          'row_id'        INTEGER,            -- Row id of the output matrix
          'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix

         -----------------------------------------------------------------------------------------
          The residual table ('tbl_residual' above) encodes a dense residual
          matrix which has the following columns

          'row_id'        INTEGER,            -- Row id of the output matrix
          'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix

         -----------------------------------------------------------------------------------------
          The result summary table ('tbl_result_summary' above) has the following columns

           'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
           'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
           'relative_residual_norm'   DOUBLE PRECISION    -- Relative error of the residuals
         ----------------------------------------------------------------
         """.format(schema_madlib=schema_madlib)
     else:
         return """
         ----------------------------------------------------------------
                  Summary: PCA Projection
         ----------------------------------------------------------------
         PCA Projection: Projects a dataset to an already trained
         space of principal components.
         --
         For function usage information, run
         SELECT {schema_madlib}.pca_sparse_project('usage');
         --
         """.format(schema_madlib=schema_madlib)


 # Validate arguments: Same as pca
 # ------------------------------------------------------------------------
 def _validate_args(schema_madlib,
                    source_table,
                    pc_table,
                    out_table,
                    row_id,
                    col_id=None,
                    val_id=None,
                    row_dim=None,
                    col_dim=None,
                    residual_table=None,
                    result_summary_table=None):
     """
     Validates all arguments passed to the PCA function

     Args:
         @param schema_madlib  Name of MADlib schema
         @param source_table  Name of the input table (containing data to project)
         @param pc_table   Name of table with principal components (output by the training function)
         @param out_table  Name of output table to store projection result
         @param row_id  Name of the row_id column
         @param col_id  Name of the col_id column (only for sparse matrices)
         @param val_id  Name of the val_id column (only for sparse matrices)
         @param row_dim  Number of rows in input matrix (only for sparse matrices)
         @param col_dim  Number of columns in input matrix (only for sparse matrices)
         @param residual_table  Name of the residual table (to store error in projection)
         @param result_summary_table  Name of result summary table
     Returns:
         None
     Throws:
         plpy.error if any argument is invalid
     """
     _assert(source_table is not None and table_exists(source_table),
             "PCA error: Source data table does not exist!")

     _assert(pc_table is not None and table_exists(pc_table),
             "PCA error: Principal comp. table does not exist!")

     _assert(table_exists(add_postfix(pc_table, "_mean")),
             "PCA error: Source data table column means does not exist!")

     # Make sure that the output table does not exist
     # Also check that the output table is not null
     _assert(out_table and out_table.strip(),
             "PCA error: Invalid output table name.")
     _assert(not table_exists(out_table, only_first_schema=True),
             "PCA error: Output table {0} already exists!".format(str(out_table)))

     # Check that the result summary table is not empty
     if result_summary_table is not None:
         _assert(result_summary_table.strip(),
                 "PCA error: Invalid result summary table name!")
         _assert(not table_exists(result_summary_table, only_first_schema=True),
                 "PCA error: Result summary table {0} already exists!".
                 format(result_summary_table))

     # Check that the result summary table is not empty
     if residual_table is not None:
         _assert(residual_table.strip(),
                 "PCA error: Invalid residual table name!")
         _assert(not table_exists(residual_table, only_first_schema=True),
                 "PCA error: Residual table {0} already exists!".
                 format(residual_table))

     # Check that the row_id exists
     _assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
             "PCA error: {1} column does not exist in {0}!".
             format(source_table, "NULL" if row_id is None else row_id))

     # For sparse inputs: Check that the row_id exists
     if col_id or val_id:
         _assert(col_id,
                 "PCA error: Column ID should be provided if value ID is input!")
         _assert(val_id,
                 "PCA error: Value ID should be provided if column ID is input!")
         _assert(columns_exist_in_table(source_table, [col_id], schema_madlib),
                 "PCA error: {1} column does not exist in {0}!".
                 format(source_table, col_id))
         _assert(columns_exist_in_table(source_table, [val_id], schema_madlib),
                 "PCA error: {1} column does not exist in {0}!".
                 format(source_table, val_id))
         _assert(row_dim > 0 and col_dim > 0,
                 "PCA error: row_dim/col_dim should be positive integer")

         validate_sparse(source_table,
                         {'row': row_id, 'col': col_id, 'val': val_id},
                         checkCol=False)
 # ------------------------------------------------------------------------


 def pca_sparse_project(schema_madlib,
                        source_table,
                        pc_table,
                        out_table,
                        row_id,
                        col_id,
                        val_id,
                        row_dim,
                        col_dim,
                        residual_table,
                        result_summary_table,
                        **kwargs):
     """
     PCA projection of the matrix in source_table.

     This function is the specific call for pca projection. It projects
     the input matrix into the principal components.

     Args:
         @param schema_madlib  Name of MADlib schema
         @param source_table  Name of the input table (containing data to project)
         @param pc_table   Name of table with principal components (output by the training function)
         @param out_table  Name of output table to store projection result
         @param row_id  Name of the row_id column
         @param col_id  Name of the col_id column
         @param val_id  Name of the val_id column
         @param row_dim  Number of rows in input matrix
         @param col_dim  Number of columns in input matrix
         @param residual_table  Name of the residual table (to store error in projection)
         @param result_summary_table  Name of result summary table
     Returns:
         None
     Throws:
         plpy.error if any argument is invalid
     """

     # Reset the message level to avoid random messages
     old_msg_level = plpy.execute("""
                                   SELECT setting
                                   FROM pg_settings
                                   WHERE name='client_min_messages'
                                   """)[0]['setting']
     plpy.execute('SET client_min_messages TO warning')

     # Step 1: Validate the input arguments
     _validate_args(schema_madlib,
                    source_table,
                    pc_table,
                    out_table,
                    row_id,
                    col_id,
                    val_id,
                    row_dim,
                    col_dim,
                    residual_table,
                    result_summary_table)

     # Step 2: Create a copy of the sparse matrix and add row_dims and col_dims
     # Warning: This changes the column names of the table
     sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
     create_temp_sparse_matrix_table_with_dims(source_table,
                                               sparse_table_copy,
                                               row_id,
                                               col_id,
                                               val_id,
                                               row_dim,
                                               col_dim)

     # Step 3: Densify the input matrix
     x_dense = "pg_temp." + unique_string() + "_dense"
     plpy.execute("""
         SELECT {schema_madlib}.matrix_densify(
             '{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
             '{x_dense}', 'row=row_id, col=col_id,val=row_vec')
         """.format(**locals()))

     # Step 4: Pass the densified matrix to regular PCA
     pca_project(schema_madlib,
                 x_dense,
                 pc_table,
                 out_table,
                 'row_id',
                 residual_table,
                 result_summary_table)

     # Step 4: Clean up
     plpy.execute(
         """
         DROP TABLE IF EXISTS {x_dense};
         DROP TABLE IF EXISTS {sparse_table_copy};
         """.format(x_dense=x_dense,
                    sparse_table_copy=sparse_table_copy))

     plpy.execute("SET client_min_messages TO %s" % old_msg_level)


 # ------------------------------------------------------------------------
 def pca_project(schema_madlib,
                 source_table,
                 pc_table,
                 out_table,
                 row_id,
                 residual_table,
                 result_summary_table,
                 **kwargs):
     """
     PCA projection of the matrix in source_table.

     This function is the specific call for pca projection. It projects
     the input matrix into the principal components.

     Args:
         @param schema_madlib  Name of MADlib schema
         @param source_table  Name of the input table (containing data to project)
         @param pc_table   Name of table with principal components (output by the training function)
         @param out_table  Name of output table to store projection result
         @param row_id  Name of the row_id column
         @param residual_table  Name of the residual table (to store error in projection)
         @param result_summary_table  Name of result summary table
     Returns:
         None
     Throws:
         plpy.error if any argument is invalid
     """
     t0 = time.time()  # measure the starting time

     # Reset the message level to avoid random messages
     old_msg_level = plpy.execute("""
                                   SELECT setting
                                   FROM pg_settings
                                   WHERE name='client_min_messages'
                                   """)[0]['setting']
     plpy.execute('SET client_min_messages TO warning')

     # Step 1: Validate the input arguments
     _validate_args(schema_madlib, source_table, pc_table, out_table,
                    row_id, None, None, None, None,
                    residual_table, result_summary_table)

     # Make sure that the table has row_id and row_vec
     source_table_copy = "pg_temp." + unique_string()
     need_new_column_names = cast_dense_input_table_to_correct_columns(
         schema_madlib, source_table, source_table_copy, row_id)

     if(need_new_column_names):
         source_table = source_table_copy

     [row_dim, col_dim] = get_dims(source_table,
                                   {'row': 'row_id', 'col': 'col_id',
                                    'val': 'row_vec'})
     validate_dense(source_table,
                    {'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'},
                    checkCol=False, row_dim=row_dim)

     # Step 2: Compute the PCA Projection matrix
     # The R code to perform this step is
     #  p <- princomp(mat)
     #  low_rank_representation <- mat %*% p$loadings[,1:k]

     # First normalize the data (Column means)
     scaled_source_table = "pg_temp." + unique_string() + "_scaled_table"
     x_std_str = _array_to_string([1] * col_dim)

     pc_table_mean = add_postfix(pc_table, "_mean")
     plpy.execute(
         """
         CREATE TABLE {scaled_source_table} AS
             SELECT
                 row_id,
                 ({schema_madlib}.utils_normalize_data(
                                   row_vec,
                                   (select column_mean from {pc_table_mean}),
                                   '{x_std_str}'::double precision[]))
                     AS row_vec
             FROM {source_table}
         """.format(schema_madlib=schema_madlib,
                    pc_table_mean=pc_table_mean,
                    source_table=source_table,
                    scaled_source_table=scaled_source_table,
                    x_std_str=x_std_str))

     plpy.execute(
         """
         SELECT {schema_madlib}.matrix_mult('{scaled_source_table}',
                                             'trans=false,row=row_id, col=col_id, val=row_vec',
                                            '{pc_table}',
                                             'trans=TRUE, row=row_id, col=col_id, val=principal_components',
                                             '{out_table}',
                                             'row=row_id, col=col_id,val=row_vec');
         """.format(schema_madlib=schema_madlib,
                    scaled_source_table=scaled_source_table,
                    pc_table=pc_table,
                    out_table=out_table))

     # Step 3: Compute the Residual table (if required)
     # Residual table: res = mat - proj
     if residual_table or result_summary_table:
         create_temp_residual_table = False
         if not residual_table:
             create_temp_residual_table = True
             residual_table = "pg_temp." + unique_string() + "_temp_residual"
         approx_table = "pg_temp." + unique_string() + "_approx"
         # Build an approximate reconstruction of the data
         plpy.execute(
             """
             SELECT {schema_madlib}.matrix_mult('{out_table}',
                                                 'row=row_id, col=col_id, val=row_vec',
                                                '{pc_table}',
                                                 'row=row_id, col=col_id, val=principal_components',
                                                 '{approx_table}',
                                                 'row=row_id, col=col_id, val=row_vec');
             """.format(schema_madlib=schema_madlib,
                        out_table=out_table,
                        pc_table=pc_table,
                        approx_table=approx_table))

         # Compute the difference between the reconstruction and real data
         # Note that both the approximation and source data are recentered here
         plpy.execute(
             """
             SELECT {schema_madlib}.matrix_scale_and_add(
                                         '{scaled_source_table}',
                                         'row=row_id, col=col_id, val=row_vec',
                                         '{approx_table}',
                                         'row=row_id, col=col_id, val=row_vec',
                                         -1,
                                         '{residual_table}',
                                         'row=row_id, col=col_id, val=row_vec');
             """.format(schema_madlib=schema_madlib,
                        scaled_source_table=scaled_source_table,
                        approx_table=approx_table,
                        residual_table=residual_table))

         # Step 4: Compute the results summary table (if required)
         # If the residual table is not asked by the user, but he does ask for
         # result summary table, then we need to compute the residuals
         if result_summary_table:
             source_table_norm = plpy.execute(
                 """
                 SELECT {schema_madlib}.matrix_norm('{source_table}',
                                                    'row=row_id, col=col_id, val=row_vec') as r
                 """.format(schema_madlib=schema_madlib,
                            source_table=source_table,
                            row_id=row_id))[0]['r']

             # Compute the norm of the residual table
             residual_norm = plpy.execute(
                 """
                 SELECT {schema_madlib}.matrix_norm('{residual_table_name}',
                                                    'row=row_id, col=col_id, val=row_vec') as r
                 """.format(schema_madlib=schema_madlib,
                            residual_table_name=residual_table,
                            row_id=row_id))[0]['r']
             # Compute the relative error of the norm
             # Prevent division by zero
             if(source_table_norm > ZERO_THRESHOLD):
                 relative_residual_norm = residual_norm / source_table_norm
             else:
                 relative_residual_norm = 0
             plpy.execute(
                 """
                 CREATE TABLE {result_summary_table} ( exec_time FLOAT8,
                                                       residual_norm FLOAT8,
                                                       relative_residual_norm FLOAT8);
                 """.format(result_summary_table=result_summary_table))
             # Compute the time in milli-seconds
             t1 = time.time()
             dt = (t1 - t0) * 1000.

             plpy.execute(
                 """
                 INSERT INTO {result_summary_table} VALUES
                     ({dt},
                      {residual_norm}::double precision,
                      {relative_residual_norm}::double precision);
                 """.format(dt=dt,
                            residual_norm=residual_norm,
                            result_summary_table=result_summary_table,
                            relative_residual_norm=relative_residual_norm))

             plpy.execute("""
                      DROP TABLE IF EXISTS {approx_table};
                     """.format(approx_table=approx_table))
             if create_temp_residual_table:
                 plpy.execute("""
                      DROP TABLE IF EXISTS {residual_table};
                      """.format(residual_table=residual_table))

     plpy.execute("DROP TABLE IF EXISTS {0}".format(scaled_source_table))
     plpy.execute("SET client_min_messages TO %s" % old_msg_level)
	"""
	@file pca_project.py_in

	@namespace pca
	"""

	import plpy
	import time

	from linalg.matrix_ops import cast_dense_input_table_to_correct_columns
	from linalg.matrix_ops import create_temp_sparse_matrix_table_with_dims
	from linalg.matrix_ops import get_dims
	from linalg.matrix_ops import validate_dense
	from linalg.matrix_ops import validate_sparse
	from utilities.utilities import __mad_version
	from utilities.utilities import unique_string
	from utilities.utilities import _assert
	from utilities.utilities import _array_to_string
	from utilities.validate_args import columns_exist_in_table
	from utilities.validate_args import table_exists
	from utilities.utilities import add_postfix


	version_wrapper = __mad_version()
	string_to_array = version_wrapper.select_vecfunc()
	array_to_string = version_wrapper.select_vec_return()
	ZERO_THRESHOLD = 1e-6


	# Dense PCA help function
	def pca_project_help(schema_madlib, usage_string=None, **kwargs):
	"""
	Given a usage string, give out function usage information.
	"""
	if usage_string is None:
	usage_string = ''

	if (usage_string.lower() == "usage"):
	return """
	----------------------------------------------------------------
	Usage
	----------------------------------------------------------------
	SELECT {schema_madlib}.pca_project (
	'tbl_source', -- Data table
	'tbl_pc', -- Table with principal componenents
	(obtained as output from pca_train)
	'tbl_result', -- Result table
	'row_id', -- Name of the column containing the row_id
	-- Optional Parameters
	----------------------------------------------------------------
	'tbl_residual', -- Residual table (Default: NULL)
	'tbl_result_summary', -- Result summary table (Default : NULL)
	);

	Output Tables
	--------------------------------------------------------------------
	The output is divided into three tables (two of which are optional)

	--------------------------------------------------------------------
	The output table ('tbl_result' above) encodes a dense matrix
	with the projection onto the principal components. The matrix contains
	the following columns:

	'row_id' INTEGER, -- Row id of the output matrix
	'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix

	--------------------------------------------------------------------
	The residual table ('tbl_residual' above) encodes a dense residual
	matrix which has the following columns

	'row_id' INTEGER, -- Row id of the output matrix
	'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix

	--------------------------------------------------------------------
	The result summary table ('tbl_result_summary' above) has the following columns

	'exec_time' INTEGER, -- Wall clock time (ms) of the function.
	'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals
	'relative_residual_norm' DOUBLE PRECISION -- Relative error of the residuals
	----------------------------------------------------------------
	""".format(schema_madlib=schema_madlib)
	else:
	return """
	----------------------------------------------------------------
	Summary: PCA Projection
	----------------------------------------------------------------
	PCA Projection: Projects a dataset to an already trained
	space of principal components.
	--
	For function usage information, run
	SELECT {schema_madlib}.pca_project('usage');
	--
	""".format(schema_madlib=schema_madlib)


	# Sparse PCA help function
	# ------------------------------------------------------------------------
	def pca_sparse_project_help(schema_madlib, usage_string=None, **kwargs):
	"""
	Given a usage string, give out function usage information.
	"""
	if usage_string is None:
	usage_string = ''

	if (usage_string.lower() == "usage"):
	return """
	----------------------------------------------------------------
	Usage
	----------------------------------------------------------------
	SELECT {schema_madlib}.pca_sparse_project (
	'tbl_source', -- Data table
	'tbl_pc', -- Table with principal componenents
	(obtained as output from pca_train)
	'tbl_result', -- Result table
	'row_id', -- Name of the column containing the row_id
	'col_id', -- Name of the column containing the col_id
	'val_id', -- Name of the column containing the val_id
	'row_dim' -- Row dimension of the sparse matrix
	'col_dim' -- Column dimension of the sparse matrix
	-- Optional Parameters
	----------------------------------------------------------------
	'tbl_residual', -- Residual table (Default: NULL)
	'tbl_result_summary', -- Result summary table (Default : NULL)
	);

	Output Tables
	----------------------------------------------------------------
	The output is divided into three tables (two of which are optional)

	-----------------------------------------------------------------------------------------
	The output table ('tbl_result' above) encodes a dense matrix
	with the projection onto the principal components. The matrix contains
	the following columns:

	'row_id' INTEGER, -- Row id of the output matrix
	'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix

	-----------------------------------------------------------------------------------------
	The residual table ('tbl_residual' above) encodes a dense residual
	matrix which has the following columns

	'row_id' INTEGER, -- Row id of the output matrix
	'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix

	-----------------------------------------------------------------------------------------
	The result summary table ('tbl_result_summary' above) has the following columns

	'exec_time' INTEGER, -- Wall clock time (ms) of the function.
	'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals
	'relative_residual_norm' DOUBLE PRECISION -- Relative error of the residuals
	----------------------------------------------------------------
	""".format(schema_madlib=schema_madlib)
	else:
	return """
	----------------------------------------------------------------
	Summary: PCA Projection
	----------------------------------------------------------------
	PCA Projection: Projects a dataset to an already trained
	space of principal components.
	--
	For function usage information, run
	SELECT {schema_madlib}.pca_sparse_project('usage');
	--
	""".format(schema_madlib=schema_madlib)


	# Validate arguments: Same as pca
	# ------------------------------------------------------------------------
	def _validate_args(schema_madlib,
	source_table,
	pc_table,
	out_table,
	row_id,
	col_id=None,
	val_id=None,
	row_dim=None,
	col_dim=None,
	residual_table=None,
	result_summary_table=None):
	"""
	Validates all arguments passed to the PCA function

	Args:
	@param schema_madlib Name of MADlib schema
	@param source_table Name of the input table (containing data to project)
	@param pc_table Name of table with principal components (output by the training function)
	@param out_table Name of output table to store projection result
	@param row_id Name of the row_id column
	@param col_id Name of the col_id column (only for sparse matrices)
	@param val_id Name of the val_id column (only for sparse matrices)
	@param row_dim Number of rows in input matrix (only for sparse matrices)
	@param col_dim Number of columns in input matrix (only for sparse matrices)
	@param residual_table Name of the residual table (to store error in projection)
	@param result_summary_table Name of result summary table
	Returns:
	None
	Throws:
	plpy.error if any argument is invalid
	"""
	_assert(source_table is not None and table_exists(source_table),
	"PCA error: Source data table does not exist!")

	_assert(pc_table is not None and table_exists(pc_table),
	"PCA error: Principal comp. table does not exist!")

	_assert(table_exists(add_postfix(pc_table, "_mean")),
	"PCA error: Source data table column means does not exist!")

	# Make sure that the output table does not exist
	# Also check that the output table is not null
	_assert(out_table and out_table.strip(),
	"PCA error: Invalid output table name.")
	_assert(not table_exists(out_table, only_first_schema=True),
	"PCA error: Output table {0} already exists!".format(str(out_table)))

	# Check that the result summary table is not empty
	if result_summary_table is not None:
	_assert(result_summary_table.strip(),
	"PCA error: Invalid result summary table name!")
	_assert(not table_exists(result_summary_table, only_first_schema=True),
	"PCA error: Result summary table {0} already exists!".
	format(result_summary_table))

	# Check that the result summary table is not empty
	if residual_table is not None:
	_assert(residual_table.strip(),
	"PCA error: Invalid residual table name!")
	_assert(not table_exists(residual_table, only_first_schema=True),
	"PCA error: Residual table {0} already exists!".
	format(residual_table))

	# Check that the row_id exists
	_assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
	"PCA error: {1} column does not exist in {0}!".
	format(source_table, "NULL" if row_id is None else row_id))

	# For sparse inputs: Check that the row_id exists
	if col_id or val_id:
	_assert(col_id,
	"PCA error: Column ID should be provided if value ID is input!")
	_assert(val_id,
	"PCA error: Value ID should be provided if column ID is input!")
	_assert(columns_exist_in_table(source_table, [col_id], schema_madlib),
	"PCA error: {1} column does not exist in {0}!".
	format(source_table, col_id))
	_assert(columns_exist_in_table(source_table, [val_id], schema_madlib),
	"PCA error: {1} column does not exist in {0}!".
	format(source_table, val_id))
	_assert(row_dim > 0 and col_dim > 0,
	"PCA error: row_dim/col_dim should be positive integer")

	validate_sparse(source_table,
	{'row': row_id, 'col': col_id, 'val': val_id},
	checkCol=False)
	# ------------------------------------------------------------------------


	def pca_sparse_project(schema_madlib,
	source_table,
	pc_table,
	out_table,
	row_id,
	col_id,
	val_id,
	row_dim,
	col_dim,
	residual_table,
	result_summary_table,
	**kwargs):
	"""
	PCA projection of the matrix in source_table.

	This function is the specific call for pca projection. It projects
	the input matrix into the principal components.

	Args:
	@param schema_madlib Name of MADlib schema
	@param source_table Name of the input table (containing data to project)
	@param pc_table Name of table with principal components (output by the training function)
	@param out_table Name of output table to store projection result
	@param row_id Name of the row_id column
	@param col_id Name of the col_id column
	@param val_id Name of the val_id column
	@param row_dim Number of rows in input matrix
	@param col_dim Number of columns in input matrix
	@param residual_table Name of the residual table (to store error in projection)
	@param result_summary_table Name of result summary table
	Returns:
	None
	Throws:
	plpy.error if any argument is invalid
	"""

	# Reset the message level to avoid random messages
	old_msg_level = plpy.execute("""
	SELECT setting
	FROM pg_settings
	WHERE name='client_min_messages'
	""")[0]['setting']
	plpy.execute('SET client_min_messages TO warning')

	# Step 1: Validate the input arguments
	_validate_args(schema_madlib,
	source_table,
	pc_table,
	out_table,
	row_id,
	col_id,
	val_id,
	row_dim,
	col_dim,
	residual_table,
	result_summary_table)

	# Step 2: Create a copy of the sparse matrix and add row_dims and col_dims
	# Warning: This changes the column names of the table
	sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
	create_temp_sparse_matrix_table_with_dims(source_table,
	sparse_table_copy,
	row_id,
	col_id,
	val_id,
	row_dim,
	col_dim)

	# Step 3: Densify the input matrix
	x_dense = "pg_temp." + unique_string() + "_dense"
	plpy.execute("""
	SELECT {schema_madlib}.matrix_densify(
	'{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
	'{x_dense}', 'row=row_id, col=col_id,val=row_vec')
	""".format(**locals()))

	# Step 4: Pass the densified matrix to regular PCA
	pca_project(schema_madlib,
	x_dense,
	pc_table,
	out_table,
	'row_id',
	residual_table,
	result_summary_table)

	# Step 4: Clean up
	plpy.execute(
	"""
	DROP TABLE IF EXISTS {x_dense};
	DROP TABLE IF EXISTS {sparse_table_copy};
	""".format(x_dense=x_dense,
	sparse_table_copy=sparse_table_copy))

	plpy.execute("SET client_min_messages TO %s" % old_msg_level)


	# ------------------------------------------------------------------------
	def pca_project(schema_madlib,
	source_table,
	pc_table,
	out_table,
	row_id,
	residual_table,
	result_summary_table,
	**kwargs):
	"""
	PCA projection of the matrix in source_table.

	This function is the specific call for pca projection. It projects
	the input matrix into the principal components.

	Args:
	@param schema_madlib Name of MADlib schema
	@param source_table Name of the input table (containing data to project)
	@param pc_table Name of table with principal components (output by the training function)
	@param out_table Name of output table to store projection result
	@param row_id Name of the row_id column
	@param residual_table Name of the residual table (to store error in projection)
	@param result_summary_table Name of result summary table
	Returns:
	None
	Throws:
	plpy.error if any argument is invalid
	"""
	t0 = time.time() # measure the starting time

	# Reset the message level to avoid random messages
	old_msg_level = plpy.execute("""
	SELECT setting
	FROM pg_settings
	WHERE name='client_min_messages'
	""")[0]['setting']
	plpy.execute('SET client_min_messages TO warning')

	# Step 1: Validate the input arguments
	_validate_args(schema_madlib, source_table, pc_table, out_table,
	row_id, None, None, None, None,
	residual_table, result_summary_table)

	# Make sure that the table has row_id and row_vec
	source_table_copy = "pg_temp." + unique_string()
	need_new_column_names = cast_dense_input_table_to_correct_columns(
	schema_madlib, source_table, source_table_copy, row_id)

	if(need_new_column_names):
	source_table = source_table_copy

	[row_dim, col_dim] = get_dims(source_table,
	{'row': 'row_id', 'col': 'col_id',
	'val': 'row_vec'})
	validate_dense(source_table,
	{'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'},
	checkCol=False, row_dim=row_dim)

	# Step 2: Compute the PCA Projection matrix
	# The R code to perform this step is
	# p <- princomp(mat)
	# low_rank_representation <- mat %*% p$loadings[,1:k]

	# First normalize the data (Column means)
	scaled_source_table = "pg_temp." + unique_string() + "_scaled_table"
	x_std_str = _array_to_string([1] * col_dim)

	pc_table_mean = add_postfix(pc_table, "_mean")
	plpy.execute(
	"""
	CREATE TABLE {scaled_source_table} AS
	SELECT
	row_id,
	({schema_madlib}.utils_normalize_data(
	row_vec,
	(select column_mean from {pc_table_mean}),
	'{x_std_str}'::double precision[]))
	AS row_vec
	FROM {source_table}
	""".format(schema_madlib=schema_madlib,
	pc_table_mean=pc_table_mean,
	source_table=source_table,
	scaled_source_table=scaled_source_table,
	x_std_str=x_std_str))

	plpy.execute(
	"""
	SELECT {schema_madlib}.matrix_mult('{scaled_source_table}',
	'trans=false,row=row_id, col=col_id, val=row_vec',
	'{pc_table}',
	'trans=TRUE, row=row_id, col=col_id, val=principal_components',
	'{out_table}',
	'row=row_id, col=col_id,val=row_vec');
	""".format(schema_madlib=schema_madlib,
	scaled_source_table=scaled_source_table,
	pc_table=pc_table,
	out_table=out_table))

	# Step 3: Compute the Residual table (if required)
	# Residual table: res = mat - proj
	if residual_table or result_summary_table:
	create_temp_residual_table = False
	if not residual_table:
	create_temp_residual_table = True
	residual_table = "pg_temp." + unique_string() + "_temp_residual"
	approx_table = "pg_temp." + unique_string() + "_approx"
	# Build an approximate reconstruction of the data
	plpy.execute(
	"""
	SELECT {schema_madlib}.matrix_mult('{out_table}',
	'row=row_id, col=col_id, val=row_vec',
	'{pc_table}',
	'row=row_id, col=col_id, val=principal_components',
	'{approx_table}',
	'row=row_id, col=col_id, val=row_vec');
	""".format(schema_madlib=schema_madlib,
	out_table=out_table,
	pc_table=pc_table,
	approx_table=approx_table))

	# Compute the difference between the reconstruction and real data
	# Note that both the approximation and source data are recentered here
	plpy.execute(
	"""
	SELECT {schema_madlib}.matrix_scale_and_add(
	'{scaled_source_table}',
	'row=row_id, col=col_id, val=row_vec',
	'{approx_table}',
	'row=row_id, col=col_id, val=row_vec',
	-1,
	'{residual_table}',
	'row=row_id, col=col_id, val=row_vec');
	""".format(schema_madlib=schema_madlib,
	scaled_source_table=scaled_source_table,
	approx_table=approx_table,
	residual_table=residual_table))

	# Step 4: Compute the results summary table (if required)
	# If the residual table is not asked by the user, but he does ask for
	# result summary table, then we need to compute the residuals
	if result_summary_table:
	source_table_norm = plpy.execute(
	"""
	SELECT {schema_madlib}.matrix_norm('{source_table}',
	'row=row_id, col=col_id, val=row_vec') as r
	""".format(schema_madlib=schema_madlib,
	source_table=source_table,
	row_id=row_id))[0]['r']

	# Compute the norm of the residual table
	residual_norm = plpy.execute(
	"""
	SELECT {schema_madlib}.matrix_norm('{residual_table_name}',
	'row=row_id, col=col_id, val=row_vec') as r
	""".format(schema_madlib=schema_madlib,
	residual_table_name=residual_table,
	row_id=row_id))[0]['r']
	# Compute the relative error of the norm
	# Prevent division by zero
	if(source_table_norm > ZERO_THRESHOLD):
	relative_residual_norm = residual_norm / source_table_norm
	else:
	relative_residual_norm = 0
	plpy.execute(
	"""
	CREATE TABLE {result_summary_table} ( exec_time FLOAT8,
	residual_norm FLOAT8,
	relative_residual_norm FLOAT8);
	""".format(result_summary_table=result_summary_table))
	# Compute the time in milli-seconds
	t1 = time.time()
	dt = (t1 - t0) * 1000.

	plpy.execute(
	"""
	INSERT INTO {result_summary_table} VALUES
	({dt},
	{residual_norm}::double precision,
	{relative_residual_norm}::double precision);
	""".format(dt=dt,
	residual_norm=residual_norm,
	result_summary_table=result_summary_table,
	relative_residual_norm=relative_residual_norm))

	plpy.execute("""
	DROP TABLE IF EXISTS {approx_table};
	""".format(approx_table=approx_table))
	if create_temp_residual_table:
	plpy.execute("""
	DROP TABLE IF EXISTS {residual_table};
	""".format(residual_table=residual_table))

	plpy.execute("DROP TABLE IF EXISTS {0}".format(scaled_source_table))
	plpy.execute("SET client_min_messages TO %s" % old_msg_level)