| """ |
| @file pca_project.py_in |
| |
| @namespace pca |
| """ |
| |
| import plpy |
| import time |
| |
| from linalg.matrix_ops import cast_dense_input_table_to_correct_columns |
| from linalg.matrix_ops import create_temp_sparse_matrix_table_with_dims |
| from linalg.matrix_ops import get_dims |
| from linalg.matrix_ops import validate_dense |
| from linalg.matrix_ops import validate_sparse |
| from utilities.utilities import __mad_version |
| from utilities.utilities import unique_string |
| from utilities.utilities import _assert |
| from utilities.utilities import _array_to_string |
| from utilities.validate_args import columns_exist_in_table |
| from utilities.validate_args import table_exists |
| from utilities.utilities import add_postfix |
| |
| |
| version_wrapper = __mad_version() |
| string_to_array = version_wrapper.select_vecfunc() |
| array_to_string = version_wrapper.select_vec_return() |
| ZERO_THRESHOLD = 1e-6 |
| |
| |
| # Dense PCA help function |
| def pca_project_help(schema_madlib, usage_string=None, **kwargs): |
| """ |
| Given a usage string, give out function usage information. |
| """ |
| if usage_string is None: |
| usage_string = '' |
| |
| if (usage_string.lower() == "usage"): |
| return """ |
| ---------------------------------------------------------------- |
| Usage |
| ---------------------------------------------------------------- |
| SELECT {schema_madlib}.pca_project ( |
| 'tbl_source', -- Data table |
| 'tbl_pc', -- Table with principal componenents |
| (obtained as output from pca_train) |
| 'tbl_result', -- Result table |
| 'row_id', -- Name of the column containing the row_id |
| -- Optional Parameters |
| ---------------------------------------------------------------- |
| 'tbl_residual', -- Residual table (Default: NULL) |
| 'tbl_result_summary', -- Result summary table (Default : NULL) |
| ); |
| |
| Output Tables |
| -------------------------------------------------------------------- |
| The output is divided into three tables (two of which are optional) |
| |
| -------------------------------------------------------------------- |
| The output table ('tbl_result' above) encodes a dense matrix |
| with the projection onto the principal components. The matrix contains |
| the following columns: |
| |
| 'row_id' INTEGER, -- Row id of the output matrix |
| 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix |
| |
| -------------------------------------------------------------------- |
| The residual table ('tbl_residual' above) encodes a dense residual |
| matrix which has the following columns |
| |
| 'row_id' INTEGER, -- Row id of the output matrix |
| 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix |
| |
| -------------------------------------------------------------------- |
| The result summary table ('tbl_result_summary' above) has the following columns |
| |
| 'exec_time' INTEGER, -- Wall clock time (ms) of the function. |
| 'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals |
| 'relative_residual_norm' DOUBLE PRECISION -- Relative error of the residuals |
| ---------------------------------------------------------------- |
| """.format(schema_madlib=schema_madlib) |
| else: |
| return """ |
| ---------------------------------------------------------------- |
| Summary: PCA Projection |
| ---------------------------------------------------------------- |
| PCA Projection: Projects a dataset to an already trained |
| space of principal components. |
| -- |
| For function usage information, run |
| SELECT {schema_madlib}.pca_project('usage'); |
| -- |
| """.format(schema_madlib=schema_madlib) |
| |
| |
| # Sparse PCA help function |
| # ------------------------------------------------------------------------ |
| def pca_sparse_project_help(schema_madlib, usage_string=None, **kwargs): |
| """ |
| Given a usage string, give out function usage information. |
| """ |
| if usage_string is None: |
| usage_string = '' |
| |
| if (usage_string.lower() == "usage"): |
| return """ |
| ---------------------------------------------------------------- |
| Usage |
| ---------------------------------------------------------------- |
| SELECT {schema_madlib}.pca_sparse_project ( |
| 'tbl_source', -- Data table |
| 'tbl_pc', -- Table with principal componenents |
| (obtained as output from pca_train) |
| 'tbl_result', -- Result table |
| 'row_id', -- Name of the column containing the row_id |
| 'col_id', -- Name of the column containing the col_id |
| 'val_id', -- Name of the column containing the val_id |
| 'row_dim' -- Row dimension of the sparse matrix |
| 'col_dim' -- Column dimension of the sparse matrix |
| -- Optional Parameters |
| ---------------------------------------------------------------- |
| 'tbl_residual', -- Residual table (Default: NULL) |
| 'tbl_result_summary', -- Result summary table (Default : NULL) |
| ); |
| |
| Output Tables |
| ---------------------------------------------------------------- |
| The output is divided into three tables (two of which are optional) |
| |
| ----------------------------------------------------------------------------------------- |
| The output table ('tbl_result' above) encodes a dense matrix |
| with the projection onto the principal components. The matrix contains |
| the following columns: |
| |
| 'row_id' INTEGER, -- Row id of the output matrix |
| 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix |
| |
| ----------------------------------------------------------------------------------------- |
| The residual table ('tbl_residual' above) encodes a dense residual |
| matrix which has the following columns |
| |
| 'row_id' INTEGER, -- Row id of the output matrix |
| 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix |
| |
| ----------------------------------------------------------------------------------------- |
| The result summary table ('tbl_result_summary' above) has the following columns |
| |
| 'exec_time' INTEGER, -- Wall clock time (ms) of the function. |
| 'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals |
| 'relative_residual_norm' DOUBLE PRECISION -- Relative error of the residuals |
| ---------------------------------------------------------------- |
| """.format(schema_madlib=schema_madlib) |
| else: |
| return """ |
| ---------------------------------------------------------------- |
| Summary: PCA Projection |
| ---------------------------------------------------------------- |
| PCA Projection: Projects a dataset to an already trained |
| space of principal components. |
| -- |
| For function usage information, run |
| SELECT {schema_madlib}.pca_sparse_project('usage'); |
| -- |
| """.format(schema_madlib=schema_madlib) |
| |
| |
| # Validate arguments: Same as pca |
| # ------------------------------------------------------------------------ |
| def _validate_args(schema_madlib, |
| source_table, |
| pc_table, |
| out_table, |
| row_id, |
| col_id=None, |
| val_id=None, |
| row_dim=None, |
| col_dim=None, |
| residual_table=None, |
| result_summary_table=None): |
| """ |
| Validates all arguments passed to the PCA function |
| |
| Args: |
| @param schema_madlib Name of MADlib schema |
| @param source_table Name of the input table (containing data to project) |
| @param pc_table Name of table with principal components (output by the training function) |
| @param out_table Name of output table to store projection result |
| @param row_id Name of the row_id column |
| @param col_id Name of the col_id column (only for sparse matrices) |
| @param val_id Name of the val_id column (only for sparse matrices) |
| @param row_dim Number of rows in input matrix (only for sparse matrices) |
| @param col_dim Number of columns in input matrix (only for sparse matrices) |
| @param residual_table Name of the residual table (to store error in projection) |
| @param result_summary_table Name of result summary table |
| Returns: |
| None |
| Throws: |
| plpy.error if any argument is invalid |
| """ |
| _assert(source_table is not None and table_exists(source_table), |
| "PCA error: Source data table does not exist!") |
| |
| _assert(pc_table is not None and table_exists(pc_table), |
| "PCA error: Principal comp. table does not exist!") |
| |
| _assert(table_exists(add_postfix(pc_table, "_mean")), |
| "PCA error: Source data table column means does not exist!") |
| |
| # Make sure that the output table does not exist |
| # Also check that the output table is not null |
| _assert(out_table and out_table.strip(), |
| "PCA error: Invalid output table name.") |
| _assert(not table_exists(out_table, only_first_schema=True), |
| "PCA error: Output table {0} already exists!".format(str(out_table))) |
| |
| # Check that the result summary table is not empty |
| if result_summary_table is not None: |
| _assert(result_summary_table.strip(), |
| "PCA error: Invalid result summary table name!") |
| _assert(not table_exists(result_summary_table, only_first_schema=True), |
| "PCA error: Result summary table {0} already exists!". |
| format(result_summary_table)) |
| |
| # Check that the result summary table is not empty |
| if residual_table is not None: |
| _assert(residual_table.strip(), |
| "PCA error: Invalid residual table name!") |
| _assert(not table_exists(residual_table, only_first_schema=True), |
| "PCA error: Residual table {0} already exists!". |
| format(residual_table)) |
| |
| # Check that the row_id exists |
| _assert(columns_exist_in_table(source_table, [row_id], schema_madlib), |
| "PCA error: {1} column does not exist in {0}!". |
| format(source_table, "NULL" if row_id is None else row_id)) |
| |
| # For sparse inputs: Check that the row_id exists |
| if col_id or val_id: |
| _assert(col_id, |
| "PCA error: Column ID should be provided if value ID is input!") |
| _assert(val_id, |
| "PCA error: Value ID should be provided if column ID is input!") |
| _assert(columns_exist_in_table(source_table, [col_id], schema_madlib), |
| "PCA error: {1} column does not exist in {0}!". |
| format(source_table, col_id)) |
| _assert(columns_exist_in_table(source_table, [val_id], schema_madlib), |
| "PCA error: {1} column does not exist in {0}!". |
| format(source_table, val_id)) |
| _assert(row_dim > 0 and col_dim > 0, |
| "PCA error: row_dim/col_dim should be positive integer") |
| |
| validate_sparse(source_table, |
| {'row': row_id, 'col': col_id, 'val': val_id}, |
| checkCol=False) |
| # ------------------------------------------------------------------------ |
| |
| |
| def pca_sparse_project(schema_madlib, |
| source_table, |
| pc_table, |
| out_table, |
| row_id, |
| col_id, |
| val_id, |
| row_dim, |
| col_dim, |
| residual_table, |
| result_summary_table, |
| **kwargs): |
| """ |
| PCA projection of the matrix in source_table. |
| |
| This function is the specific call for pca projection. It projects |
| the input matrix into the principal components. |
| |
| Args: |
| @param schema_madlib Name of MADlib schema |
| @param source_table Name of the input table (containing data to project) |
| @param pc_table Name of table with principal components (output by the training function) |
| @param out_table Name of output table to store projection result |
| @param row_id Name of the row_id column |
| @param col_id Name of the col_id column |
| @param val_id Name of the val_id column |
| @param row_dim Number of rows in input matrix |
| @param col_dim Number of columns in input matrix |
| @param residual_table Name of the residual table (to store error in projection) |
| @param result_summary_table Name of result summary table |
| Returns: |
| None |
| Throws: |
| plpy.error if any argument is invalid |
| """ |
| |
| # Reset the message level to avoid random messages |
| old_msg_level = plpy.execute(""" |
| SELECT setting |
| FROM pg_settings |
| WHERE name='client_min_messages' |
| """)[0]['setting'] |
| plpy.execute('SET client_min_messages TO warning') |
| |
| # Step 1: Validate the input arguments |
| _validate_args(schema_madlib, |
| source_table, |
| pc_table, |
| out_table, |
| row_id, |
| col_id, |
| val_id, |
| row_dim, |
| col_dim, |
| residual_table, |
| result_summary_table) |
| |
| # Step 2: Create a copy of the sparse matrix and add row_dims and col_dims |
| # Warning: This changes the column names of the table |
| sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy" |
| create_temp_sparse_matrix_table_with_dims(source_table, |
| sparse_table_copy, |
| row_id, |
| col_id, |
| val_id, |
| row_dim, |
| col_dim) |
| |
| # Step 3: Densify the input matrix |
| x_dense = "pg_temp." + unique_string() + "_dense" |
| plpy.execute(""" |
| SELECT {schema_madlib}.matrix_densify( |
| '{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}', |
| '{x_dense}', 'row=row_id, col=col_id,val=row_vec') |
| """.format(**locals())) |
| |
| # Step 4: Pass the densified matrix to regular PCA |
| pca_project(schema_madlib, |
| x_dense, |
| pc_table, |
| out_table, |
| 'row_id', |
| residual_table, |
| result_summary_table) |
| |
| # Step 4: Clean up |
| plpy.execute( |
| """ |
| DROP TABLE IF EXISTS {x_dense}; |
| DROP TABLE IF EXISTS {sparse_table_copy}; |
| """.format(x_dense=x_dense, |
| sparse_table_copy=sparse_table_copy)) |
| |
| plpy.execute("SET client_min_messages TO %s" % old_msg_level) |
| |
| |
| # ------------------------------------------------------------------------ |
| def pca_project(schema_madlib, |
| source_table, |
| pc_table, |
| out_table, |
| row_id, |
| residual_table, |
| result_summary_table, |
| **kwargs): |
| """ |
| PCA projection of the matrix in source_table. |
| |
| This function is the specific call for pca projection. It projects |
| the input matrix into the principal components. |
| |
| Args: |
| @param schema_madlib Name of MADlib schema |
| @param source_table Name of the input table (containing data to project) |
| @param pc_table Name of table with principal components (output by the training function) |
| @param out_table Name of output table to store projection result |
| @param row_id Name of the row_id column |
| @param residual_table Name of the residual table (to store error in projection) |
| @param result_summary_table Name of result summary table |
| Returns: |
| None |
| Throws: |
| plpy.error if any argument is invalid |
| """ |
| t0 = time.time() # measure the starting time |
| |
| # Reset the message level to avoid random messages |
| old_msg_level = plpy.execute(""" |
| SELECT setting |
| FROM pg_settings |
| WHERE name='client_min_messages' |
| """)[0]['setting'] |
| plpy.execute('SET client_min_messages TO warning') |
| |
| # Step 1: Validate the input arguments |
| _validate_args(schema_madlib, source_table, pc_table, out_table, |
| row_id, None, None, None, None, |
| residual_table, result_summary_table) |
| |
| # Make sure that the table has row_id and row_vec |
| source_table_copy = "pg_temp." + unique_string() |
| need_new_column_names = cast_dense_input_table_to_correct_columns( |
| schema_madlib, source_table, source_table_copy, row_id) |
| |
| if(need_new_column_names): |
| source_table = source_table_copy |
| |
| [row_dim, col_dim] = get_dims(source_table, |
| {'row': 'row_id', 'col': 'col_id', |
| 'val': 'row_vec'}) |
| validate_dense(source_table, |
| {'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'}, |
| checkCol=False, row_dim=row_dim) |
| |
| # Step 2: Compute the PCA Projection matrix |
| # The R code to perform this step is |
| # p <- princomp(mat) |
| # low_rank_representation <- mat %*% p$loadings[,1:k] |
| |
| # First normalize the data (Column means) |
| scaled_source_table = "pg_temp." + unique_string() + "_scaled_table" |
| x_std_str = _array_to_string([1] * col_dim) |
| |
| pc_table_mean = add_postfix(pc_table, "_mean") |
| plpy.execute( |
| """ |
| CREATE TABLE {scaled_source_table} AS |
| SELECT |
| row_id, |
| ({schema_madlib}.utils_normalize_data( |
| row_vec, |
| (select column_mean from {pc_table_mean}), |
| '{x_std_str}'::double precision[])) |
| AS row_vec |
| FROM {source_table} |
| """.format(schema_madlib=schema_madlib, |
| pc_table_mean=pc_table_mean, |
| source_table=source_table, |
| scaled_source_table=scaled_source_table, |
| x_std_str=x_std_str)) |
| |
| plpy.execute( |
| """ |
| SELECT {schema_madlib}.matrix_mult('{scaled_source_table}', |
| 'trans=false,row=row_id, col=col_id, val=row_vec', |
| '{pc_table}', |
| 'trans=TRUE, row=row_id, col=col_id, val=principal_components', |
| '{out_table}', |
| 'row=row_id, col=col_id,val=row_vec'); |
| """.format(schema_madlib=schema_madlib, |
| scaled_source_table=scaled_source_table, |
| pc_table=pc_table, |
| out_table=out_table)) |
| |
| # Step 3: Compute the Residual table (if required) |
| # Residual table: res = mat - proj |
| if residual_table or result_summary_table: |
| create_temp_residual_table = False |
| if not residual_table: |
| create_temp_residual_table = True |
| residual_table = "pg_temp." + unique_string() + "_temp_residual" |
| approx_table = "pg_temp." + unique_string() + "_approx" |
| # Build an approximate reconstruction of the data |
| plpy.execute( |
| """ |
| SELECT {schema_madlib}.matrix_mult('{out_table}', |
| 'row=row_id, col=col_id, val=row_vec', |
| '{pc_table}', |
| 'row=row_id, col=col_id, val=principal_components', |
| '{approx_table}', |
| 'row=row_id, col=col_id, val=row_vec'); |
| """.format(schema_madlib=schema_madlib, |
| out_table=out_table, |
| pc_table=pc_table, |
| approx_table=approx_table)) |
| |
| # Compute the difference between the reconstruction and real data |
| # Note that both the approximation and source data are recentered here |
| plpy.execute( |
| """ |
| SELECT {schema_madlib}.matrix_scale_and_add( |
| '{scaled_source_table}', |
| 'row=row_id, col=col_id, val=row_vec', |
| '{approx_table}', |
| 'row=row_id, col=col_id, val=row_vec', |
| -1, |
| '{residual_table}', |
| 'row=row_id, col=col_id, val=row_vec'); |
| """.format(schema_madlib=schema_madlib, |
| scaled_source_table=scaled_source_table, |
| approx_table=approx_table, |
| residual_table=residual_table)) |
| |
| # Step 4: Compute the results summary table (if required) |
| # If the residual table is not asked by the user, but he does ask for |
| # result summary table, then we need to compute the residuals |
| if result_summary_table: |
| source_table_norm = plpy.execute( |
| """ |
| SELECT {schema_madlib}.matrix_norm('{source_table}', |
| 'row=row_id, col=col_id, val=row_vec') as r |
| """.format(schema_madlib=schema_madlib, |
| source_table=source_table, |
| row_id=row_id))[0]['r'] |
| |
| # Compute the norm of the residual table |
| residual_norm = plpy.execute( |
| """ |
| SELECT {schema_madlib}.matrix_norm('{residual_table_name}', |
| 'row=row_id, col=col_id, val=row_vec') as r |
| """.format(schema_madlib=schema_madlib, |
| residual_table_name=residual_table, |
| row_id=row_id))[0]['r'] |
| # Compute the relative error of the norm |
| # Prevent division by zero |
| if(source_table_norm > ZERO_THRESHOLD): |
| relative_residual_norm = residual_norm / source_table_norm |
| else: |
| relative_residual_norm = 0 |
| plpy.execute( |
| """ |
| CREATE TABLE {result_summary_table} ( exec_time FLOAT8, |
| residual_norm FLOAT8, |
| relative_residual_norm FLOAT8); |
| """.format(result_summary_table=result_summary_table)) |
| # Compute the time in milli-seconds |
| t1 = time.time() |
| dt = (t1 - t0) * 1000. |
| |
| plpy.execute( |
| """ |
| INSERT INTO {result_summary_table} VALUES |
| ({dt}, |
| {residual_norm}::double precision, |
| {relative_residual_norm}::double precision); |
| """.format(dt=dt, |
| residual_norm=residual_norm, |
| result_summary_table=result_summary_table, |
| relative_residual_norm=relative_residual_norm)) |
| |
| plpy.execute(""" |
| DROP TABLE IF EXISTS {approx_table}; |
| """.format(approx_table=approx_table)) |
| if create_temp_residual_table: |
| plpy.execute(""" |
| DROP TABLE IF EXISTS {residual_table}; |
| """.format(residual_table=residual_table)) |
| |
| plpy.execute("DROP TABLE IF EXISTS {0}".format(scaled_source_table)) |
| plpy.execute("SET client_min_messages TO %s" % old_msg_level) |