| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file pca_project.sql_in |
| * |
| * @brief Principal Component Analysis Projection |
| * |
| * @sa For a brief introduction to Principal Component Analysis, see the module |
| * description \ref grp_pca. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_pca_project |
| |
| |
| <div class ="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#project">Projection Function</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#notes">Notes</a></li> |
| <li class="level1"><a href="#background_project">Technical Background</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| @brief Projects a higher dimensional data point to a lower |
| dimensional subspace spanned by principal components learned through the PCA training |
| procedure. |
| |
| |
| Principal component projection is a mathematical procedure that projects high |
| dimensional data onto a lower dimensional space. This lower dimensional space |
| is defined by the \f$ k \f$ principal components with the highest variance in |
| the training data. |
| |
| More details on the mathematics of PCA can be found in \ref |
| grp_pca_train and some details about principal component projection calculations |
| can be found in the \ref background_project "Technical Background". |
| |
| |
| @anchor project |
| @par Projection Function |
| The projection functions are slightly different for dense and sparse matrices. For dense matrices: |
| <pre class="syntax"> |
| madlib.pca_project( source_table, |
| pc_table, |
| out_table, |
| row_id, |
| residual_table, |
| result_summary_table |
| ) |
| </pre> |
| For sparse matrices: |
| <pre class="syntax"> |
| madlib.pca_sparse_project( source_table, |
| pc_table, |
| out_table, |
| row_id, |
| col_id, -- Sparse matrices only |
| val_id, -- Sparse matrices only |
| row_dim, -- Sparse matrices only |
| col_dim, -- Sparse matrices only |
| residual_table, |
| result_summary_table |
| ) |
| </pre> |
| |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>source_table</DT> |
| <DD>TEXT. Source table name. |
| Identical to \ref pca_train, the input data matrix should have \f$ N \f$ rows |
| and \f$ M \f$ columns, where \f$ N \f$ is the number of data points, and \f$ M |
| \f$ is the number of features for each data point. |
| |
| The input table for <em> pca_project </em> is expected to be in the one of the |
| two standard MADlib dense matrix formats, and the sparse input table for <em> |
| pca_sparse_project </em> should be in the standard MADlib sparse matrix format. |
| These formats are described in the documentation for \ref grp_pca_train.</DD> |
| |
| <DT>pc_table</DT> |
| <DD>TEXT. Table name for the table containing principal components. </DD> |
| |
| <DT>out_table</DT> |
| <DD>TEXT. Name of the table that will contain the low-dimensional representation of the input data. |
| |
| The <em>out_table</em> encodes a dense matrix with the projection onto the |
| principal components. The table has the following columns: |
| |
| <table class="output"> |
| <tr> |
| <th>row_id</th> |
| <td>Row id of the output matrix.</td> |
| </tr> |
| <tr> |
| <th>row_vec</th> |
| <td>A vector containing elements in the row of the matrix.</td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>row_id</DT> |
| <DD>TEXT. Column name containing the row IDs in the input source table. The column |
| should be of type INT (or a type that can be cast to INT) and should only contain values between 1 and |
| <em>N</em>. For dense matrix format, it should contain all continguous integers from 1 to <em>N</em> |
| describing the full matrix.</DD> |
| |
| <DT>col_id</DT> |
| <DD>TEXT. Column name containing the column IDs in sparse matrix representation. |
| The column should be of type INT (or a type that can be cast to INT) |
| and should only contain values between 1 and <em>M</em>. <em>This parameter applies to |
| sparse matrices only.</em></DD> |
| |
| <DT>val_id</DT> |
| <DD>TEXT. Name of 'val_id' column in sparse matrix representation defining the values of the nonzero entries. |
| <em>This parameter applies to sparse matrices only.</em></DD> |
| |
| <DT>row_dim</DT> |
| <DD>INTEGER. The actual number of rows in the matrix. That is, |
| if the matrix was transformed into dense format, this is the number of rows |
| it would have. |
| <em>This parameter applies to sparse matrices only.</em></DD> |
| |
| <DT>col_dim</DT> |
| <DD>INTEGER. The actual number of columns in the matrix. That is, |
| if the matrix was transformed into dense format, this is the number of columns |
| it would have. |
| <em>This parameter applies to sparse matrices only.</em></DD> |
| |
| @note The parameters 'row_dim' and 'col_dim' could actually be inferred from the |
| sparse matrix representation, so they will be removed in the future. |
| For now they are maintained for backward compatability so you must enter them. |
| Making 'row_dim' or 'col_dim' larger than the actual matrix has the effect of padding it with |
| zeros, which is probably not useful. |
| |
| <DT>residual_table (optional)</DT> |
| <DD>TEXT, default: NULL. Name of the optional residual table. |
| |
| The <em>residual_table</em> encodes a dense residual matrix. The table has the following columns: |
| |
| <table class="output"> |
| <tr> |
| <th>row_id</th> |
| <td>Row id of the output matrix.</td> |
| </tr> |
| <tr> |
| <th>row_vec</th> |
| <td>A vector containing elements in the row of the residual matrix.</td> |
| </tr> |
| </table> |
| </DD> |
| |
| <DT>result_summary_table (optional)</DT> |
| <DD>TEXT, default: NULL. Name of the optional summary table. |
| |
| The <em>result_summary_table</em> contains information about the performance time of the PCA projection. The table has the following columns: |
| <table class="output"> |
| <tr> |
| <th>exec_time</th> |
| <td>Elapsed time (ms) for execution of the function.</td> |
| </tr> |
| <tr> |
| <th>residual_norm</th> |
| <td>Absolute error of the residuals.</td> |
| </tr> |
| <tr> |
| <th>relative_residual_norm</th> |
| <td>Relative error of the residuals.</td> |
| </tr> |
| </table></DD> |
| </DL> |
| |
| @anchor examples |
| @examp |
| -# View online help for the PCA projection function: |
| <pre class="example"> |
| SELECT madlib.pca_project(); |
| </pre> |
| |
| -# Create sample data in dense matrix form: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat; |
| CREATE TABLE mat (id integer, |
| row_vec double precision[] |
| ); |
| INSERT INTO mat VALUES |
| (1, '{1,2,3}'), |
| (2, '{2,1,2}'), |
| (3, '{3,2,1}'); |
| </pre> |
| |
| -# Run the PCA function for a specified number of principal components and view the results: |
| <pre class="example"> |
| DROP TABLE IF EXISTS result_table, result_table_mean; |
| SELECT madlib.pca_train('mat', -- Source table |
| 'result_table', -- Output table |
| 'id', -- Row id of source table |
| 2); -- Number of principal components |
| SELECT * FROM result_table ORDER BY row_id; |
| </pre> |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion |
| --------+--------------------------------------------------------------+-------------------+------------------- |
| 1 | {0.707106781186547,-6.93889390390723e-18,-0.707106781186548} | 1.41421356237309 | 0.857142857142244 |
| 2 | {0,1,0} | 0.577350269189626 | 0.142857142857041 |
| (2 rows) |
| </pre> |
| |
| -# Project the original data to a lower dimensional representation and view the result of the projection: |
| <pre class="example"> |
| DROP TABLE IF EXISTS residual_table, result_summary_table, out_table; |
| SELECT madlib.pca_project( 'mat', |
| 'result_table', |
| 'out_table', |
| 'id', |
| 'residual_table', |
| 'result_summary_table' |
| ); |
| SELECT * FROM out_table ORDER BY row_id; |
| </pre> |
| <pre class="result"> |
| row_id | row_vec |
| --------+-------------------------------------- |
| 1 | {-1.41421356237309,-0.33333333333} |
| 2 | {2.77555756157677e-17,0.66666666667} |
| 3 | {1.41421356237309,-0.33333333333} |
| (3 rows) |
| </pre> |
| Check the error in the projection: |
| <pre class="example"> |
| SELECT * FROM result_summary_table; |
| </pre> |
| <pre class="result"> |
| exec_time | residual_norm | relative_residual_norm |
| ---------------+-------------------+------------------------ |
| 331.792116165 | 5.89383520611e-16 | 9.68940539229e-17 |
| (1 row) |
| </pre> |
| Check the residuals: |
| <pre class="example"> |
| SELECT * FROM residual_table ORDER BY row_id; |
| </pre> |
| <pre class="result"> |
| row_id | row_vec |
| --------+-------------------------------------------------------------------- |
| 1 | {-2.22044604925031e-16,-1.11022302462516e-16,3.33066907387547e-16} |
| 2 | {-1.12243865646685e-18,0,4.7381731349413e-17} |
| 3 | {2.22044604925031e-16,1.11022302462516e-16,-3.33066907387547e-16} |
| (3 rows) |
| </pre> |
| |
| -# Now we use grouping in dense form to learn different models for different groups. |
| First, we create sample data in dense matrix form with a grouping column. |
| Note we actually have different matrix sizes for the different groups, which |
| is allowed for dense: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_group; |
| CREATE TABLE mat_group ( |
| id integer, |
| row_vec double precision[], |
| matrix_id integer |
| ); |
| INSERT INTO mat_group VALUES |
| (1, '{1,2,3}', 1), |
| (2, '{2,1,2}', 1), |
| (3, '{3,2,1}', 1), |
| (4, '{1,2,3,4,5}', 2), |
| (5, '{2,5,2,4,1}', 2), |
| (6, '{5,4,3,2,1}', 2); |
| </pre> |
| |
| -# Run the PCA function with grouping for a specified proportion of variance and view the results: |
| <pre class="example"> |
| DROP TABLE IF EXISTS result_table_group, result_table_group_mean; |
| SELECT madlib.pca_train('mat_group', -- Source table |
| 'result_table_group', -- Output table |
| 'id', -- Row id of source table |
| 0.8, -- Proportion of variance |
| 'matrix_id'); -- Grouping column |
| SELECT * FROM result_table_group ORDER BY matrix_id, row_id; |
| </pre> |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion | matrix_id |
| --------+------------------------------------------------------------------------------------------------+-----------------+-------------------+----------- |
| 1 | {0.707106781186548,0,-0.707106781186547} | 1.4142135623731 | 0.857142857142245 | 1 |
| 1 | {-0.555378486712784,-0.388303582074091,0.0442457354870796,0.255566375612852,0.688115693174023} | 3.2315220311722 | 0.764102534485173 | 2 |
| 2 | {0.587384101786277,-0.485138064894743,0.311532046315153,-0.449458074050715,0.347212037159181} | 1.795531127192 | 0.235897465516047 | 2 |
| (3 rows) |
| </pre> |
| |
| -# Run the PCA projection on subsets of an input table based on grouping columns. |
| Note that the parameter 'pc_table' used for projection must be generated in training |
| using the same grouping columns. |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_group_projected; |
| SELECT madlib.pca_project('mat_group', |
| 'result_table_group', |
| 'mat_group_projected', |
| 'id'); |
| SELECT * FROM mat_group_projected ORDER BY matrix_id, row_id; |
| </pre> |
| <pre class="result"> |
| row_id | row_vec | matrix_id |
| --------+---------------------------------------+----------- |
| 1 | {1.4142135623731} | 1 |
| 2 | {7.40148683087139e-17} | 1 |
| 3 | {-1.4142135623731} | 1 |
| 4 | {-3.59290479201926,0.559694003674779} | 2 |
| 5 | {0.924092949098971,-2.00871628417505} | 2 |
| 6 | {2.66881184290186,1.44902228049511} | 2 |
| (6 rows) |
| </pre> |
| |
| -# Now let's look at sparse matrices. Create sample data in sparse matrix form: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_sparse; |
| CREATE TABLE mat_sparse ( |
| row_id integer, |
| col_id integer, |
| value double precision |
| ); |
| INSERT INTO mat_sparse VALUES |
| (1, 1, 1.0), |
| (2, 2, 2.0), |
| (3, 3, 3.0), |
| (4, 4, 4.0), |
| (1, 5, 5.0), |
| (2, 4, 6.0), |
| (3, 2, 7.0), |
| (4, 3, 8.0); |
| </pre> |
| As an aside, this is what the sparse matrix above looks like when |
| put in dense form: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_dense; |
| SELECT madlib.matrix_densify('mat_sparse', |
| 'row=row_id, col=col_id, val=value', |
| 'mat_dense'); |
| SELECT * FROM mat_dense ORDER BY row_id; |
| </pre> |
| <pre class="result"> |
| row_id | value |
| --------+------------- |
| 1 | {1,0,0,0,5} |
| 2 | {0,2,0,6,0} |
| 3 | {0,7,3,0,0} |
| 4 | {0,0,8,4,0} |
| (4 rows) |
| </pre> |
| |
| -# Run the PCA sparse function for a specified number of principal components and view the results: |
| <pre class="example">DROP TABLE IF EXISTS result_table, result_table_mean; |
| SELECT madlib.pca_sparse_train( 'mat_sparse', -- Source table |
| 'result_table', -- Output table |
| 'row_id', -- Row id of source table |
| 'col_id', -- Column id of source table |
| 'value', -- Value of matrix at row_id, col_id |
| 4, -- Actual number of rows in the matrix |
| 5, -- Actual number of columns in the matrix |
| 3); -- Number of principal components |
| SELECT * FROM result_table ORDER BY row_id; |
| </pre> |
| Result (with principal components truncated for readability): |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion |
| --------+----------------------------------------------+------------------+------------------- |
| 1 | {-0.0876046030186158,-0.0968983772909994,... | 4.21362803829554 | 0.436590030617467 |
| 2 | {-0.0647272661608605,0.877639526308692,... | 3.68408023747461 | 0.333748701544697 |
| 3 | {-0.0780380267884855,0.177956517174911,... | 3.05606908060098 | 0.229661267837836 |
| (3 rows) |
| </pre> |
| |
| -# Project the original sparse data to low-dimensional representation: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_sparse_out; |
| SELECT madlib.pca_sparse_project( |
| 'mat_sparse', |
| 'result_table', |
| 'mat_sparse_out', |
| 'row_id', |
| 'col_id', |
| 'value', |
| 4, |
| 5 |
| ); |
| SELECT * FROM mat_sparse_out ORDER BY row_id; |
| </pre> |
| <pre class="result"> |
| row_id | row_vec |
| --------+--------------------------------------------------------- |
| 1 | {4.66617015032369,-2.63552220635847,2.1865220849604} |
| 2 | {0.228360685652383,-1.21616275892926,-4.46864627611561} |
| 3 | {0.672067460100428,5.45249627172823,0.56445525585642} |
| 4 | {-5.5665982960765,-1.6008113064405,1.71766893529879} |
| (4 rows) |
| </pre> |
| |
| -# Now we use grouping in sparse form to learn different models for different groups. |
| First, we create sample data in sparse matrix form with a grouping column: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_sparse_group; |
| CREATE TABLE mat_sparse_group ( |
| row_id integer, |
| col_id integer, |
| value double precision, |
| matrix_id integer); |
| INSERT INTO mat_sparse_group VALUES |
| (1, 1, 1.0, 1), |
| (2, 2, 2.0, 1), |
| (3, 3, 3.0, 1), |
| (4, 4, 4.0, 1), |
| (1, 5, 5.0, 1), |
| (2, 4, 6.0, 2), |
| (3, 2, 7.0, 2), |
| (4, 3, 8.0, 2); |
| </pre> |
| |
| -# Run the PCA function with grouping for a specified proportion of variance |
| and view the results: |
| <pre class="example"> |
| DROP TABLE IF EXISTS result_table_group, result_table_group_mean; |
| SELECT madlib.pca_sparse_train( 'mat_sparse_group', -- Source table |
| 'result_table_group', -- Output table |
| 'row_id', -- Row id of source table |
| 'col_id', -- Column id of source table |
| 'value', -- Value of matrix at row_id, col_id |
| 4, -- Actual number of rows in the matrix |
| 5, -- Actual number of columns in the matrix |
| 0.8, -- Proportion of variance |
| 'matrix_id'); |
| SELECT * FROM result_table_group ORDER BY matrix_id, row_id; |
| </pre> |
| Result (with principal components truncated for readability): |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion | matrix_id |
| --------+--------------------------------------------+------------------+-------------------+----------- |
| 1 | {-0.17805696611353,0.0681313257646983,... | 2.73659933165925 | 0.544652792875481 | 1 |
| 2 | {-0.0492086814863993,0.149371585357526,... | 2.06058314533194 | 0.308800210823714 | 1 |
| 1 | {0,-0.479486114660443,... | 4.40325305087975 | 0.520500333693473 | 2 |
| 2 | {0,0.689230898585949,... | 3.7435566458567 | 0.376220573442628 | 2 |
| (4 rows) |
| </pre> |
| |
| -# Projection in sparse format with grouping: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_sparse_group_projected; |
| SELECT madlib.pca_sparse_project( |
| 'mat_sparse_group', |
| 'result_table_group', |
| 'mat_sparse_group_projected', |
| 'row_id', |
| 'col_id', |
| 'value', |
| 4, |
| 5 |
| ); |
| SELECT * FROM mat_sparse_group_projected ORDER BY matrix_id, row_id; |
| </pre> |
| <pre class="result"> |
| row_id | row_vec | matrix_id |
| --------+-----------------------------------------+----------- |
| 1 | {-4.00039298524261,-0.626820612715982} | 1 |
| 2 | {0.765350785238575,0.951348276645455} | 1 |
| 3 | {1.04951017256904,2.22388180170356} | 1 |
| 4 | {2.185532027435,-2.54840946563303} | 1 |
| 1 | {-0.627846810195469,-0.685031603549092} | 2 |
| 2 | {-1.64754249747757,-4.7662114622896} | 2 |
| 3 | {-3.98424961281857,4.13958468655255} | 2 |
| 4 | {6.25963892049161,1.31165837928614} | 2 |
| (8 rows) |
| </pre> |
| |
| @anchor notes |
| @par Notes |
| - This function is intended to operate on the principal component tables |
| generated by <em> pca_train </em> or <em> pca_sparse_train</em>. The MADlib PCA |
| functions generate a table containing the column-means in addition to a table |
| containing the principal components. If this table is not found by the MADlib |
| projection function, it will trigger an error. As long the principal component |
| tables are created with MADlib functions, then the column-means table will be |
| automatically found by the MADlib projection functions. |
| |
| - Because of the centering step in PCA projection |
| (see "Technical Background"), sparse matrices almost always |
| become dense during the projection |
| process. Thus, this implementation automatically densifies sparse matrix input, |
| and there should be no expected performance improvement in using sparse matrix |
| input over dense matrix input. |
| |
| - Table names can be optionally schema qualified (current_schemas() is |
| searched if a schema name is not provided) and all table and column names |
| should follow case-sensitivity and quoting rules per the database. |
| (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'. |
| If mixed-case or multi-byte characters are desired for entity names then the |
| string should be double-quoted; in this case the input would be '"MyTable"'). |
| |
| - If the input table for pca_project (pca_sparse_project) contains grouping columns, |
| the same grouping columns must be used in the training function used to generate the |
| principal components too. |
| |
| |
| @anchor background_project |
| @par Technical Background |
| |
| Given a table containing some principal components \f$ \boldsymbol P \f$ and |
| some input data \f$ \boldsymbol X \f$, the low-dimensional representation \f$ |
| {\boldsymbol X}' \f$ is computed as \f{align*}{ {\boldsymbol {\hat{X}}} & = |
| {\boldsymbol X} - \vec{e} \hat{x}^T \\ {\boldsymbol X}' & = {\boldsymbol |
| {\hat {X}}} {\boldsymbol P}. \f} where \f$\hat{x} \f$ is the column means of |
| \f$ \boldsymbol X \f$ and \f$ \vec{e} \f$ is the vector of all ones. This |
| step is equivalent to centering the data around the origin. |
| |
| The residual table \f$ \boldsymbol R \f$ is a measure of how well the |
| low-dimensional representation approximates the true input data, and is computed |
| as \f[ {\boldsymbol R} = {\boldsymbol {\hat{X}}} - {\boldsymbol X}' {\boldsymbol |
| P}^T. \f] A residual matrix with entries mostly close to zero indicates a good |
| representation. |
| |
| The residual norm \f$ r \f$ is simply |
| \f[ |
| r = \|{\boldsymbol R}\|_F |
| \f] |
| where \f$ \|\cdot\|_F \f$ is the Frobenius norm. The relative residual norm \f$ r' \f$ is |
| \f[ |
| r' = \frac{ \|{\boldsymbol R}\|_F }{\|{\boldsymbol X}\|_F } |
| \f] |
| |
| |
| @anchor related |
| @par Related Topics |
| File pca_project.sql_in documenting the SQL functions |
| |
| \ref grp_pca_train |
| |
| |
| */ |
| |
| -- ----------------------------------------------------------------------- |
| -- PCA projection for Dense matrices |
| -- ----------------------------------------------------------------------- |
| /* |
| @brief Compute principal compoents for a dense matrix stored in a |
| database table |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_project( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Principal components table (output from pca module) |
| out_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| residual_table TEXT, -- Residual table (Default: NULL) |
| result_summary_table TEXT -- Table name to store summary of results (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(pca, pca_project, pca_project) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Overloaded functions for optional parameters |
| -- ----------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_project( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Principal components table (output from pca module) |
| out_table TEXT, -- Output table name for the principal components |
| row_id TEXT -- Column name for the ID for each row |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_project($1, $2, $3, $4, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_project( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Principal components table (output from pca module) |
| out_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| residual_table TEXT -- Residual table (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_project($1, $2, $3, $4, $5, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| -- Help and usage functions |
| ----------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_project( |
| usage_string TEXT -- Usage string |
| ) |
| RETURNS VARCHAR AS $$ |
| PythonFunction(pca, pca_project, pca_project_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_project() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.pca_project(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| |
| -- ----------------------------------------------------------------------- |
| -- PCA sparse projection for dense matrices |
| -- ----------------------------------------------------------------------- |
| /* |
| @brief Compute principal compoents for a dense matrix stored in a |
| database table |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_project( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Principal components table (output from pca module) |
| out_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the row id |
| col_id TEXT, -- Column name for the col id |
| val_id TEXT, -- Column name for the value id |
| row_dim INT4, -- Row dimension of the sparse matrix |
| col_dim INT4, -- Column dimension of the sparse matrix |
| residual_table TEXT, -- Residual table (Default: NULL) |
| result_summary_table TEXT -- Table name to store summary of results (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(pca, pca_project, pca_sparse_project) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| -- Overloaded functions for optional parameters |
| -- ----------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_project( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Principal components table (output from pca module) |
| out_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the row id |
| col_id TEXT, -- Column name for the col id |
| val_id TEXT, -- Column name for the value id |
| row_dim INT4, -- Row dimension of the sparse matrix |
| col_dim INT4 -- Column dimension of the sparse matrix |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_project($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_project( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Principal components table (output from pca module) |
| out_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the row id |
| col_id TEXT, -- Column name for the col id |
| val_id TEXT, -- Column name for the value id |
| row_dim INT4, -- Row dimension of the sparse matrix |
| col_dim INT4, -- Column dimension of the sparse matrix |
| residual_table TEXT -- Residual table (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_project($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_project_union( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| out_table TEXT, -- Output table name |
| row_id TEXT, -- Column name for the ID for each row |
| original_row_id TEXT, -- copy of the row_id originally passed |
| grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) |
| grouping_cols_clause TEXT, -- Part of the SQL query to be used with grouping_cols |
| residual_table TEXT, -- Residual table name |
| result_summary_table TEXT, -- Table name to store summary of results (Default: NULL) |
| grp_id INTEGER, -- a place holder id for each group |
| grouping_where_clause TEXT, -- WHERE clause using grouping_cols |
| sparse_where_condition TEXT, -- WHERE clause used when creating temp sparse matrix table with dims |
| select_grouping_cols TEXT, -- SELECT clause using grouping_cols |
| grouping_cols_values TEXT, -- distinct values of the grouping_cols |
| temp_source_table_columns TEXT, -- SELECT caluse for creating temporary copy of the source_table |
| temp_pc_table_columns TEXT, -- non grouping_cols of the source_table |
| is_sparse BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices |
| col_id TEXT, -- sparse representation based detail |
| val_id TEXT, -- sparse representation based detail |
| row_dim INTEGER, -- sparse representation based detail |
| col_dim INTEGER -- sparse representation based detail |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(pca, pca_project, _pca_project_union) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| -- Help and usage functions |
| ----------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_project( |
| usage_string TEXT -- Usage string |
| ) |
| RETURNS VARCHAR AS $$ |
| PythonFunction(pca, pca_project, pca_sparse_project_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_project() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.pca_sparse_project(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |