| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file pca.sql_in |
| * |
| * @brief Principal Component Analysis |
| * |
| * @sa For a brief introduction to Principal Component Analysis, see the module |
| * description \ref grp_pca. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| |
| @addtogroup grp_pca_train |
| |
| <div class ="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#pca_train">About</a></li> |
| <li class="level1"><a href="#help">Online Help</a></li> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#output">Output Tables</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#seealso">See Also</a></li> |
| <li class="level1"><a href="#background_pca">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @anchor pca_train |
| @about |
| Principal component analysis (PCA) is a mathematical procedure that uses an |
| orthogonal transformation to convert a set of observations of possibly |
| correlated variables into a set of values of linearly uncorrelated variables |
| called principal components. This transformation is defined in such a way that |
| the first principal component has the largest possible variance (i.e., |
| accounts for as much of the variability in the data as possible), and each |
| succeeding component in turn has the highest variance possible under the |
| constraint that it be orthogonal to (i.e., uncorrelated with) the preceding |
| components. |
| |
| See the \ref background_pca "Technical Background" for an introduction to |
| principal component analysis and the implementation notes. |
| |
| @anchor help |
| @par Online Help |
| View short help messages using the following statements: |
| @verbatim |
| -- Summary of PCA projection |
| madlib.pca_train() |
| madlib.pca_train('?') |
| madlib.pca_train('help') |
| |
| -- Training function syntax and output table format |
| madlib.pca_train('usage') |
| |
| -- Summary of PCA projection with sparse matrices |
| madlib.pca_sparse_train() |
| madlib.pca_sparse_train('?') |
| madlib.pca_sparse_train('help') |
| |
| -- Training function syntax and output table format |
| madlib.pca_sparse_train('usage') |
| @endverbatim |
| |
| @anchor train |
| @par Training Function |
| The training functions have the following formats: |
| @verbatim |
| pca_project( source_table, out_table, row_id, |
| k, grouping_cols:= NULL, |
| lanczos_iter := min(k+40, <smallest_matrix_dimension>), |
| use_correlation := False, result_summary_table := NULL) |
| @endverbatim |
| and |
| @verbatim |
| pca_sparse_project(source_table, out_table, |
| row_id, col_id, val_id, row_dim, col_dim, k, |
| grouping_cols := NULL, |
| lanczos_iter := min(k+40, <smallest_matrix_dimension>), |
| use_correlation := False, result_summary_table := NULL) |
| @endverbatim |
| |
| \note Because of the centering step in PCA (see |
| \ref background_pca "Technical Background"), sparse matrices almost always |
| become dense during the training process. Thus, this implementation |
| automatically densifies sparse matrix input, and there should be no expected |
| performance improvement in using sparse matrix input over dense matrix input. |
| |
| @par Arguments |
| \par |
| <DL class="arglist"> |
| <DT>source_table</DT> |
| <DD>Text value. Name of the input table containing the data for PCA training. |
| The input data matrix should have \f$ N \f$ rows |
| and \f$ M \f$ columns, where \f$ N \f$ is the number of data points, and \f$ M |
| \f$ is the number of features for each data point. |
| |
| A dense input table is expected to be in the one of the |
| two standard MADlib dense matrix formats, and a sparse input table |
| should be in the standard MADlib sparse matrix format. |
| |
| The two standard MADlib dense matrix formats are |
| <pre>{TABLE|VIEW} <em>source_table</em> ( |
| <em>row_id</em> INTEGER, |
| row_vec FLOAT8[], |
| )</pre> |
| and |
| <pre>{TABLE|VIEW} <em>source_table</em> ( |
| <em>row_id</em> INTEGER, |
| col1 FLOAT8, |
| col2 FLOAT8, |
| ... |
| )</pre> |
| |
| Note that the column name <em>row_id</em> is taken as an input parameter, |
| and should contain a list of row indices (starting at 0) for the input matrix. |
| |
| The input table for sparse PCA is expected to be in the form: |
| |
| <pre>{TABLE|VIEW} <em>source_table</em> ( |
| ... |
| <em>row_id</em> INTEGER, |
| <em>col_id</em> INTEGER, |
| <em>val_id</em> FLOAT8, |
| ... |
| )</pre> |
| |
| The <em>row_id</em> and <em>col_id</em> columns specify which entries |
| in the matrix are nonzero, and the <em>val_id</em> column defines the values |
| of the nonzero entries. |
| </DD> |
| |
| <DT>out_table</DT> |
| <DD>Text value. Name of the table that will contain the principal components of the input data.</DD> |
| |
| <DT>row_id</DT> |
| <DD>Text value. Column name containing the row IDs in the input source table.</DD> |
| |
| <DT>col_id</DT> |
| <DD>Text value. Name of 'col_id' column in sparse matrix representation (sparse matrices only). </DD> |
| |
| <DT>val_id</DT> |
| <DD>Text value. Name of 'val_id' column in sparse matrix representation (sparse matrices only). </DD> |
| |
| <DT>row_dim</DT> |
| <DD>Integer value. The number of rows in the sparse matrix (sparse matrices only). </DD> |
| |
| <DT>col_dim</DT> |
| <DD>Integer value. The number of columns in the sparse matrix (sparse matrices only). </DD> |
| |
| <DT>k</DT> |
| <DD>Integer value. The number of principal components to calculate from the input data. </DD> |
| |
| <DT>grouping_cols</DT> |
| <DD>Text value. Currently <em>grouping_cols</em> is present as a placeholder for forward |
| compatibility. The parameter is planned to be implemented as a comma-separated |
| list of column names, with the source data grouped using the combination of all the columns. |
| An independent PCA model will be computed for each combination of the grouping columns. Default: NULL.</DD> |
| |
| <DT>lanczos_iter</DT> |
| <DD>Integer value. The number of Lanczos iterations for the SVD calculation. |
| The Lanczos iteration number roughly corresponds to the accuracy of the SVD |
| calculation, and a higher iteration number corresponds to greater accuracy |
| but longer computation time. The number of iterations must be at least as |
| large as the value of <em>k</em>, but no larger than the smallest dimension |
| of the matrix. If the iteration number is given as zero, then the default |
| number of iterations is used. |
| Default: minimum of {k+40, smallest matrix dimension}.</DD> |
| |
| <DT>use_correlation</DT> |
| <DD>Boolean value. Whether to use the correlation matrix for calculating the principal components instead of the covariance matrix. Currently |
| <em>use_correlation</em> is a placeholder for forward compatibility, and |
| this value must be set to false. Default: False. </DD> |
| |
| <DT>result_summary_table</DT> |
| <DD>Text value. Name of the optional summary table. Default: NULL.</DD> |
| </DL> |
| |
| @anchor output |
| @par Output Tables |
| |
| The output is divided into three tables (one of which is optional). |
| The output table (<em>'out_table'</em> above) encodes the principal components with the |
| |
| <em>k</em> highest eigenvalues. The table has the following columns: |
| \par |
| <DL class="arglist"> |
| <DT>row_id</DT> |
| <DD>Eigenvalue rank in descending order of the eigenvalue size.</DD> |
| |
| <DT>principal_components</DT> |
| <DD>Vectors containing elements of the principal components.</DD> |
| |
| <DT>eigen_values</DT> |
| <DD>The eigenvalues associated with each principal component.</DD> |
| </DL> |
| |
| In addition to the output table, a table containing the column means is also generated. |
| This table has the same name as the output table, with the string "_mean" appended to the end. |
| This table has only one column: |
| \par |
| <DL class="arglist"> |
| <DT>column_mean</DT> |
| <DD> A vector containing the column means for the input matrix.</DD> |
| </DL> |
| |
| The optional summary table contains information about the performance of the PCA. |
| This table has the following columns: |
| \par |
| <DL class="arglist"> |
| <DT>rows_used</DT> |
| <DD> Number of data points in the input.</DD> |
| <DT>exec_time (ms)</DT> |
| <DD>Number of milliseconds for the PCA calculation to run.</DD> |
| <DT>iter</DT> |
| <DD>Number of iterations used in the SVD calculation. </DD> |
| <DT>recon_error</DT> |
| <DD>The absolute error in the SVD approximation. </DD> |
| <DT>relative_recon_error</DT> |
| <DD>The relative error in the SVD approximation. </DD> |
| <DT>use_correlation</DT> |
| <DD>Indicates if the correlation matrix was used. </DD> |
| </DL> |
| |
| @anchor examples |
| @examp |
| -# Create the sample data. |
| @verbatim |
| sql> DROP TABLE IF EXISTS mat; |
| CREATE TABLE mat ( |
| row_id integer, |
| row_vec double precision[] |
| ); |
| |
| sql> COPY mat (row_id, row_vec) FROM stdin; |
| 0 {1,2,3} |
| 1 {2,1,2} |
| 2 {3,2,1} |
| \. |
| |
| @endverbatim |
| -# Run the PCA function: |
| @verbatim |
| sql> drop table result_table; |
| sql> select pca_train( |
| 'mat', -- name of the input table |
| 'result_table', -- name of the output table |
| 'row_id', -- column containing the matrix indices |
| 3 -- Number of PCA components to compute |
| ); |
| @endverbatim |
| -# View the PCA results: |
| @verbatim |
| sql> SELECT * from result_table; |
| row_id | principal_components | eigen_values |
| --------+--------------------------------------------------------------+---------------------- |
| 0 | {0.707106781186547,0.408248290459781,-0.577350269192513} | 2 |
| 2 | {-0.707106781186547,0.408248290459781,-0.577350269192512} | 1.26294130828989e-08 |
| 1 | {2.08166817117217e-17,-0.816496580931809,-0.577350269183852} | 0.816496580927726 |
| |
| @endverbatim |
| |
| @anchor seealso |
| @sa File pca.sql_in documenting the SQL functions |
| @sa grp_pca_project |
| |
| @anchor background_pca |
| @par Technical Background |
| |
| The PCA implemented here uses an SVD decomposition implementation to recover |
| the principle components (as opposed to the directly computing the eigenvectors |
| of the covariance matrix). Let \f$ \boldsymbol X \f$ be the data matrix, and |
| let \f$ \hat{x} \f$ be a vector of the column averages of \f$ \boldsymbol{X}\f$. |
| PCA computes the matrix \f$ \hat{\boldsymbol X} \f$ as |
| \f[ |
| \hat{\boldsymbol X} = {\boldsymbol X} - \vec{e} \hat{x}^T |
| \f] |
| where \f$ \vec{e} \f$ is the vector of all ones. |
| |
| PCA then computes the SVD matrix factorization |
| \f[ |
| \hat{\boldsymbol X} = {\boldsymbol U}{\boldsymbol \Sigma}{\boldsymbol V}^T |
| \f] |
| where \f$ {\boldsymbol \Sigma} \f$ is a diagonal matrix. The eigenvalues are |
| recovered as the entries of \f$ {\boldsymbol \Sigma}/(\sqrt{N-1}) \f$, and the principle |
| components are the rows of \f$ {\boldsymbol V} \f$. |
| |
| |
| It is important to note that the PCA implementation assumes that the user will |
| use only the principle components that have non-zero eigenvalues. The SVD |
| calculation is done with the Lanczos method, with does not guarantee |
| correctness for singular vectors with zero-valued eigenvalues. Consequently, |
| principle components with zero-valued eigenvalues are not guaranteed to be correct. |
| Generally, this will not be problem unless the user wants to use the |
| principle components for the entire eigenspectrum. |
| |
| |
| |
| |
| @anchor literature |
| @literature |
| |
| [1] Principal Component Analysis. http://en.wikipedia.org/wiki/Principal_component_analysis |
| |
| [2] Shlens, Jonathon (2009), A Tutorial on Principal Component Analysis |
| |
| **/ |
| |
| -- ----------------------------------------------------------------------- |
| -- PCA for Dense matrices |
| -- ----------------------------------------------------------------------- |
| /* |
| @brief Compute principal components for a dense matrix stored in a |
| database table |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) |
| result_summary_table TEXT -- Table name to store summary of results (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(pca, pca, pca) |
| $$ LANGUAGE plpythonu; |
| |
| -- Overloaded functions for optional parameters |
| -- ----------------------------------------------------------------------- |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER,-- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER,-- The number of Lanczos iterations for the SVD calculation |
| use_correlation BOOLEAN -- If True correlation matrix is used for principal components |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, $6, $7, NULL) |
| $$ LANGUAGE SQL; |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER,-- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER -- The number of Lanczos iterations for the SVD calculation |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, $6, False , NULL) |
| $$ LANGUAGE SQL; |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER,-- Number of principal components to compute |
| grouping_cols TEXT -- Comma-separated list of grouping columns |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, 0, False , NULL) |
| $$ LANGUAGE SQL; |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER -- Number of principal components to compute |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, NULL, 0, False, NULL) |
| $$ LANGUAGE SQL; |
| |
| |
| -- Information Functions |
| -- ----------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pca_train( |
| usage_string VARCHAR -- usage string |
| ) |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`pca', `pca') |
| return pca.pca_help_message(schema_madlib, usage_string) |
| $$ LANGUAGE plpythonu; |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pca_train() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.pca_train(''); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |
| |
| -- ----------------------------------------------------------------------- |
| -- PCA for Sparse matrices |
| -- ----------------------------------------------------------------------- |
| /* |
| @brief Compute principal components for a sparse matrix stored in a |
| database table |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Name of 'row_id' column in sparse matrix representation |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of eigenvectors with dominant eigenvalues, sorted decreasingly |
| grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) |
| result_summary_table TEXT -- Table name to store summary of results (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(pca, pca, pca_sparse) |
| $$ LANGUAGE plpythonu; |
| |
| |
| -- Overloaded functions for optional parameters |
| -- ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation |
| use_correlation BOOLEAN -- If True correlation matrix is used for principal components |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, NULL) |
| $$ LANGUAGE SQL; |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER -- The number of Lanczos iterations for the SVD calculation |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, False , NULL) |
| $$ LANGUAGE SQL; |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT -- Comma-separated list of grouping columns |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, $9, 0, False , NULL) |
| $$ LANGUAGE SQL; |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER -- Number of principal components to compute |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, NULL, 0, False, NULL) |
| $$ LANGUAGE SQL; |
| |
| |
| -- ----------------------------------------------------------------------- |
| -- Information Functions |
| -- ----------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pca_sparse_train( |
| usage_string VARCHAR -- usage string |
| ) |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`pca', `pca') |
| return pca.pca_sparse_help_message(schema_madlib, usage_string) |
| $$ LANGUAGE plpythonu; |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pca_sparse_train() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.pca_sparse_train(''); |
| END; |
| $$ LANGUAGE plpgsql VOLATILE; |