| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file pca.sql_in |
| * |
| * @brief Principal Component Analysis |
| * |
| * @sa For a brief introduction to Principal Component Analysis, see the module |
| * description \ref grp_pca. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| |
| @addtogroup grp_pca_train |
| |
| @brief Produces a model that transforms a number of (possibly) correlated |
| variables into a (smaller) number of uncorrelated variables called principal |
| components. |
| |
| <div class ="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#notes">Notes</a></li> |
| <li class="level1"><a href="#background_pca">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| Principal component analysis (PCA) is a mathematical procedure that uses an |
| orthogonal transformation to convert a set of observations of possibly |
| correlated variables into a set of values of linearly uncorrelated variables |
| called principal components. This transformation is defined in such a way that |
| the first principal component has the largest possible variance (i.e., |
| accounts for as much of the variability in the data as possible), and each |
| succeeding component in turn has the highest variance possible under the |
| constraint that it be orthogonal to (i.e., uncorrelated with) the preceding |
| components. |
| |
| See the \ref background_pca "Technical Background" for an introduction to |
| principal component analysis. |
| |
| @anchor train |
| @par Training Function |
| The training functions are slightly different for dense and sparse matrices. For dense matrices: |
| <pre class="syntax"> |
| pca_train( source_table, |
| out_table, |
| row_id, |
| components_param, |
| grouping_cols, |
| lanczos_iter, |
| use_correlation, |
| result_summary_table |
| ) |
| </pre> |
| For sparse matrices: |
| <pre class="syntax"> |
| pca_sparse_train( source_table, |
| out_table, |
| row_id, |
| col_id, -- Sparse matrices only |
| val_id, -- Sparse matrices only |
| row_dim, -- Sparse matrices only |
| col_dim, -- Sparse matrices only |
| components_param, |
| grouping_cols, |
| lanczos_iter, |
| use_correlation, |
| result_summary_table |
| ) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>source_table</DT> |
| <DD>TEXT. Name of the input table containing the data for PCA training. |
| The input data matrix should have \f$ N \f$ rows |
| and \f$ M \f$ columns, where \f$ N \f$ is the number of data points, and \f$ M |
| \f$ is the number of features for each data point. |
| |
| A dense input table is expected to be in the one of the |
| two standard MADlib dense matrix formats, and a sparse input table |
| should be in the standard MADlib sparse matrix format. |
| |
| The two standard MADlib dense matrix formats are: |
| <pre>{TABLE|VIEW} <em>source_table</em> ( |
| <em>row_id</em> INTEGER, |
| <em>row_vec</em> FLOAT8[], |
| )</pre> |
| and |
| <pre>{TABLE|VIEW} <em>source_table</em> ( |
| <em>row_id</em> INTEGER, |
| <em>col1</em> FLOAT8, |
| <em>col2</em> FLOAT8, |
| ... |
| )</pre> |
| |
| Note that the column name <em>row_id</em> is taken as an input parameter, |
| and should contain a continguous list of row indices (starting at 1) for the input matrix. |
| |
| The input table for sparse PCA is expected to be in the form: |
| |
| <pre>{TABLE|VIEW} <em>source_table</em> ( |
| ... |
| <em>row_id</em> INTEGER, |
| <em>col_id</em> INTEGER, |
| <em>val_id</em> FLOAT8, |
| ... |
| )</pre> |
| |
| The <em>row_id</em> and <em>col_id</em> columns specify which entries |
| in the matrix are nonzero, and the <em>val_id</em> column defines the values |
| of the nonzero entries. |
| |
| Please refer to the \ref grp_matrix documentation for more details on defining matrices. |
| </DD> |
| |
| <DT>out_table</DT> |
| <DD>TEXT. The name of the table that will contain the output. |
| There are three possible output tables as described below. |
| |
| The primary output table (<em>out_table</em>) encodes the principal components |
| with the <em>k</em> highest eigenvalues where <em>k</em> is either directly |
| provided by the user or computed according to the proportion of variance. |
| The table has the following columns: |
| <table class="output"> |
| <tr> |
| <th>row_id</th> |
| <td>Eigenvalue rank in descending order of the eigenvalue size.</td> |
| </tr> |
| <tr> |
| <th>principal_components</th> |
| <td>Vectors containing elements of the principal components.</td> |
| </tr> |
| <tr> |
| <th>std_dev</th> |
| <td>The standard deviation of each principal component.</td> |
| </tr> |
| <tr> |
| <th>proportion</th> |
| <td>The proportion of variance covered by the principal component.</td> |
| </tr> |
| </table> |
| |
| The table <em>out_table_mean</em> contains the column means. |
| This table has just one column: |
| <table class="output"> |
| <tr> |
| <th>column_mean</th> |
| <td>A vector containing the column means for the input matrix.</td> |
| </tr> |
| </table> |
| |
| The optional table <em>result_summary_table</em> contains information about the performance of the PCA. The contents of this table are described under the <em>result_summary_table</em> argument. |
| </DD> |
| |
| <DT>row_id</DT> |
| <DD>TEXT. Column name containing the row IDs in the input source table. The column |
| should be of type INT (or a type that can be cast to INT) and should only contain values between 1 and |
| <em>N</em>. For dense matrix format, it should contain all continguous integers from 1 to <em>N</em> |
| describing the full matrix.</DD> |
| |
| <DT>col_id</DT> |
| <DD>TEXT. Column name containing the column IDs in sparse matrix representation. |
| The column should be of type INT (or a type that can be cast to INT) |
| and should only contain values between 1 and <em>M</em>. <em>This parameter applies to |
| sparse matrices only.</em></DD> |
| |
| <DT>val_id</DT> |
| <DD>TEXT. Name of 'val_id' column in sparse matrix representation defining the values of the nonzero entries. |
| <em>This parameter applies to sparse matrices only.</em></DD> |
| |
| <DT>row_dim</DT> |
| <DD>INTEGER. The actual number of rows in the matrix. That is, |
| if the matrix was transformed into dense format, this is the number of rows |
| it would have. |
| <em>This parameter applies to sparse matrices only.</em></DD> |
| |
| <DT>col_dim</DT> |
| <DD>INTEGER. The actual number of columns in the matrix. That is, |
| if the matrix was transformed into dense format, this is the number of columns |
| it would have. |
| <em>This parameter applies to sparse matrices only.</em></DD> |
| |
| @note The parameters 'row_dim' and 'col_dim' could actually be inferred from the |
| sparse matrix representation, so they will be removed in the future. |
| For now they are maintained for backward compatability so you must enter them. |
| Making 'row_dim' or 'col_dim' larger than the actual matrix has the effect of padding it with |
| zeros, which is probably not useful. |
| |
| <DT>components_param</DT> |
| <DD>INTEGER or FLOAT. The parameter to control the number of principal |
| components to calculate from the input data. If 'components_param' is INTEGER, |
| it is used to denote the number of principal components (<em>k</em>) to |
| compute. If 'components_param' is FLOAT, the algorithm will return enough |
| principal vectors so that the ratio of the sum of the eigenvalues collected |
| thus far to the sum of all eigenvalues is greater than this parameter |
| (proportion of variance). The value of 'components_param' must be either |
| a positive INTEGER or a FLOAT in the range (0.0,1.0]</DD> |
| |
| @note The difference in interpretation between INTEGER and FLOAT was |
| introduced to maintain backward campatibility after the proportion of |
| variance feature was introduced. A special case to be aware of: |
| 'components_param' = 1 (INTEGER) will return 1 principal |
| component, but 'components_param' = 1.0 (FLOAT) will return all |
| principal components, i.e., proportion of variance of 100%. |
| \n \n |
| Also, please note that the number of principal components (<em>k</em>) |
| is global, even in the case where grouping is used (see 'grouping_cols' |
| below). In the case of grouping, proportion of variance |
| might be a better choice; this could result in different numbers |
| of principal components for different groups. |
| |
| <DT>grouping_cols (optional)</DT> |
| <DD>TEXT, default: NULL. A comma-separated list of column names, with the |
| source data grouped using the combination of all the columns. An independent |
| PCA model will be computed for each combination of the grouping columns.</DD> |
| |
| @note Dense matrices can be different sizes for different groups if desired. |
| Sparse matrices cannot be different sizes for different groups, |
| because the 'row_dim' and 'col_dim' parameters used for sparse matrices |
| are global across all groups. |
| |
| <DT>lanczos_iter (optional)</DT> |
| <DD>INTEGER, default: minimum of {<em>k+40</em>, smallest matrix dimension} |
| where <em>k</em> is the number of principal components specified in the |
| parameter 'components_param'. This parameter defines the |
| number of Lanczos iterations for the SVD calculation. |
| The Lanczos iteration number roughly corresponds to the accuracy of the SVD |
| calculation, and a higher number of iterations corresponds to greater accuracy |
| but longer computation time. The number of iterations must be at least as |
| large as the value of <em>k</em>, but no larger than the smallest dimension |
| of the matrix. If the number of iterations is set to zero, then the default |
| number of iterations will be used.</DD> |
| |
| @note If both 'lanczos_iter' and proportion of variance (via |
| the 'components_param' parameter) are defined, 'lanczos_iter' |
| will take precedence in determining the number of |
| principal components (i.e. the number of principal components |
| will not be greater than 'lanczos_iter' even if the target |
| proportion had not been reached). |
| |
| <DT>use_correlation (optional)</DT> |
| <DD>BOOLEAN, default FALSE. Whether to use the correlation matrix for calculating the principal components instead of the covariance matrix. Currently |
| <em>use_correlation</em> is a placeholder for forward compatibility, so |
| this value must be set to false.</DD> |
| |
| <DT>result_summary_table (optional)</DT> |
| <DD>TEXT, default NULL. Name of the optional summary table. When NULL, no summary table is generated. |
| |
| This sumary table has the following columns: |
| <table class="output"> |
| <tr> |
| <th>rows_used</th> |
| <td>INTEGER. Number of data points in the input.</td> |
| </tr> |
| <tr> |
| <th>exec_time (ms)</th> |
| <td>FLOAT8. Number of milliseconds for the PCA calculation to run.</td> |
| </tr> |
| <tr> |
| <th>iter</th> |
| <td>INTEGER. Number of iterations used in the SVD calculation.</td> |
| </tr> |
| <tr> |
| <th>recon_error</th> |
| <td>FLOAT8. The absolute error in the SVD approximation.</td> |
| </tr> |
| <tr> |
| <th>relative_recon_error</th> |
| <td>FLOAT8. The relative error in the SVD approximation.</td> |
| </tr> |
| <tr> |
| <th>use_correlation</th> |
| <td>BOOLEAN. Indicates if the correlation matrix was used.</td> |
| </tr> |
| </table> |
| </DD> |
| </DL> |
| |
| |
| @anchor examples |
| @examp |
| |
| -# View online help for the PCA training functions: |
| <pre class="example"> |
| SELECT madlib.pca_train(); |
| or |
| SELECT madlib.pca_sparse_train(); |
| </pre> |
| |
| -# Create sample data in dense matrix form: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat; |
| CREATE TABLE mat (id integer, |
| row_vec double precision[] |
| ); |
| INSERT INTO mat VALUES |
| (1, '{1,2,3}'), |
| (2, '{2,1,2}'), |
| (3, '{3,2,1}'); |
| </pre> |
| |
| -# Run the PCA function for a specified number of principal components and view the results: |
| <pre class="example"> |
| DROP TABLE IF EXISTS result_table, result_table_mean; |
| SELECT madlib.pca_train('mat', -- Source table |
| 'result_table', -- Output table |
| 'id', -- Row id of source table |
| 2); -- Number of principal components |
| SELECT * FROM result_table ORDER BY row_id; |
| </pre> |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion |
| --------+--------------------------------------------------------------+-------------------+------------------- |
| 1 | {0.707106781186547,-6.93889390390723e-18,-0.707106781186548} | 1.41421356237309 | 0.857142857142244 |
| 2 | {0,1,0} | 0.577350269189626 | 0.142857142857041 |
| (2 rows) |
| </pre> |
| |
| -# Run the PCA function for a specified proportion of variance and view the results: |
| <pre class="example"> |
| %%sql |
| DROP TABLE IF EXISTS result_table, result_table_mean; |
| SELECT madlib.pca_train('mat', -- Source table |
| 'result_table', -- Output table |
| 'id', -- Row id of source table |
| 0.9); -- Proportion of variance |
| SELECT * FROM result_table ORDER BY row_id; |
| </pre> |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion |
| --------+--------------------------------------------------------------+-------------------+------------------- |
| 1 | {0.707106781186548,-2.77555756156289e-17,-0.707106781186548} | 1.4142135623731 | 0.857142857142245 |
| 2 | {-1.11022302462516e-16,-1,0} | 0.577350269189626 | 0.142857142857041 |
| (2 rows) |
| </pre> |
| |
| -# Now we use grouping in dense form to learn different models for different groups. |
| First, we create sample data in dense matrix form with a grouping column. |
| Note we actually have different matrix sizes for the different groups, which |
| is allowed for dense: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_group; |
| CREATE TABLE mat_group ( |
| id integer, |
| row_vec double precision[], |
| matrix_id integer |
| ); |
| INSERT INTO mat_group VALUES |
| (1, '{1,2,3}', 1), |
| (2, '{2,1,2}', 1), |
| (3, '{3,2,1}', 1), |
| (4, '{1,2,3,4,5}', 2), |
| (5, '{2,5,2,4,1}', 2), |
| (6, '{5,4,3,2,1}', 2); |
| </pre> |
| |
| -# Run the PCA function with grouping for a specified proportion of variance and view the results: |
| <pre class="example"> |
| DROP TABLE IF EXISTS result_table_group, result_table_group_mean; |
| SELECT madlib.pca_train('mat_group', -- Source table |
| 'result_table_group', -- Output table |
| 'id', -- Row id of source table |
| 0.8, -- Proportion of variance |
| 'matrix_id'); -- Grouping column |
| SELECT * FROM result_table_group ORDER BY matrix_id, row_id; |
| </pre> |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion | matrix_id |
| --------+------------------------------------------------------------------------------------------------+-----------------+-------------------+----------- |
| 1 | {0.707106781186548,0,-0.707106781186547} | 1.4142135623731 | 0.857142857142245 | 1 |
| 1 | {-0.555378486712784,-0.388303582074091,0.0442457354870796,0.255566375612852,0.688115693174023} | 3.2315220311722 | 0.764102534485173 | 2 |
| 2 | {0.587384101786277,-0.485138064894743,0.311532046315153,-0.449458074050715,0.347212037159181} | 1.795531127192 | 0.235897465516047 | 2 |
| (3 rows) |
| </pre> |
| |
| -# Now let's look at sparse matrices. Create sample data in sparse matrix form: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_sparse; |
| CREATE TABLE mat_sparse ( |
| row_id integer, |
| col_id integer, |
| value double precision |
| ); |
| INSERT INTO mat_sparse VALUES |
| (1, 1, 1.0), |
| (2, 2, 2.0), |
| (3, 3, 3.0), |
| (4, 4, 4.0), |
| (1, 5, 5.0), |
| (2, 4, 6.0), |
| (3, 2, 7.0), |
| (4, 3, 8.0); |
| </pre> |
| As an aside, this is what the sparse matrix above looks like when |
| put in dense form: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_dense; |
| SELECT madlib.matrix_densify('mat_sparse', |
| 'row=row_id, col=col_id, val=value', |
| 'mat_dense'); |
| SELECT * FROM mat_dense ORDER BY row_id; |
| </pre> |
| <pre class="result"> |
| row_id | value |
| --------+------------- |
| 1 | {1,0,0,0,5} |
| 2 | {0,2,0,6,0} |
| 3 | {0,7,3,0,0} |
| 4 | {0,0,8,4,0} |
| (4 rows) |
| </pre> |
| |
| -# Run the PCA sparse function for a specified number of principal components and view the results: |
| <pre class="example">DROP TABLE IF EXISTS result_table, result_table_mean; |
| SELECT madlib.pca_sparse_train( 'mat_sparse', -- Source table |
| 'result_table', -- Output table |
| 'row_id', -- Row id of source table |
| 'col_id', -- Column id of source table |
| 'value', -- Value of matrix at row_id, col_id |
| 4, -- Actual number of rows in the matrix |
| 5, -- Actual number of columns in the matrix |
| 3); -- Number of principal components |
| SELECT * FROM result_table ORDER BY row_id; |
| </pre> |
| Result (with principal components truncated for readability): |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion |
| --------+----------------------------------------------+------------------+------------------- |
| 1 | {-0.0876046030186158,-0.0968983772909994,... | 4.21362803829554 | 0.436590030617467 |
| 2 | {-0.0647272661608605,0.877639526308692,... | 3.68408023747461 | 0.333748701544697 |
| 3 | {-0.0780380267884855,0.177956517174911,... | 3.05606908060098 | 0.229661267837836 |
| (3 rows) |
| </pre> |
| |
| -# Now we use grouping in sparse form to learn different models for different groups. |
| First, we create sample data in sparse matrix form with a grouping column: |
| <pre class="example"> |
| DROP TABLE IF EXISTS mat_sparse_group; |
| CREATE TABLE mat_sparse_group ( |
| row_id integer, |
| col_id integer, |
| value double precision, |
| matrix_id integer); |
| INSERT INTO mat_sparse_group VALUES |
| (1, 1, 1.0, 1), |
| (2, 2, 2.0, 1), |
| (3, 3, 3.0, 1), |
| (4, 4, 4.0, 1), |
| (1, 5, 5.0, 1), |
| (2, 4, 6.0, 2), |
| (3, 2, 7.0, 2), |
| (4, 3, 8.0, 2); |
| </pre> |
| |
| -# Run the PCA function with grouping for a specified proportion of variance |
| and view the results: |
| <pre class="example"> |
| DROP TABLE IF EXISTS result_table_group, result_table_group_mean; |
| SELECT madlib.pca_sparse_train( 'mat_sparse_group', -- Source table |
| 'result_table_group', -- Output table |
| 'row_id', -- Row id of source table |
| 'col_id', -- Column id of source table |
| 'value', -- Value of matrix at row_id, col_id |
| 4, -- Actual number of rows in the matrix |
| 5, -- Actual number of columns in the matrix |
| 0.8, -- Proportion of variance |
| 'matrix_id'); |
| SELECT * FROM result_table_group ORDER BY matrix_id, row_id; |
| </pre> |
| Result (with principal components truncated for readability): |
| <pre class="result"> |
| row_id | principal_components | std_dev | proportion | matrix_id |
| --------+--------------------------------------------+------------------+-------------------+----------- |
| 1 | {-0.17805696611353,0.0681313257646983,... | 2.73659933165925 | 0.544652792875481 | 1 |
| 2 | {-0.0492086814863993,0.149371585357526,... | 2.06058314533194 | 0.308800210823714 | 1 |
| 1 | {0,-0.479486114660443,... | 4.40325305087975 | 0.520500333693473 | 2 |
| 2 | {0,0.689230898585949,... | 3.7435566458567 | 0.376220573442628 | 2 |
| (4 rows) |
| </pre> |
| |
| @anchor notes |
| @par Notes |
| |
| - Table names can be optionally schema qualified (current_schemas() would be |
| searched if a schema name is not provided) and all table and column names |
| should follow case-sensitivity and quoting rules per the database. |
| (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. |
| 'mytable'. |
| If mixed-case or multi-byte characters are desired for entity names then the |
| string should be double-quoted; in this case the input would be '"MyTable"'). |
| |
| - Because of the centering step in PCA (see |
| \ref background_pca "Technical Background"), sparse matrices almost always |
| become dense during the training process. Since this implementation |
| automatically densifies sparse matrix input, there should be no expected |
| performance improvement in using sparse matrix input over dense matrix input. |
| |
| - For the parameter 'components_param', INTEGER and FLOAT are |
| interpreted differently. A special case to be aware of: |
| 'components_param' = 1 (INTEGER) will return 1 principal |
| component, but 'components_param' = 1.0 (FLOAT) will return all |
| principal components, i.e., proportion of variance of 100%. |
| |
| - If both 'lanczos_iter' and proportion of variance (via |
| the 'components_param' parameter) are defined, 'lanczos_iter' |
| will take precedence in determining the number of |
| principal components (i.e. the number of principal components |
| will not be greater than 'lanczos_iter' even if the target |
| proportion had not been reached). |
| |
| @anchor background_pca |
| @par Technical Background |
| |
| The PCA implemented here uses a distributed SVD decomposition implementation to recover |
| the principal components (as opposed to the directly computing the eigenvectors |
| of the covariance matrix). Let \f$ \boldsymbol X \f$ be the data matrix, and |
| let \f$ \hat{x} \f$ be a vector of the column averages of \f$ \boldsymbol{X}\f$. |
| PCA computes the matrix \f$ \hat{\boldsymbol X} \f$ as |
| \f[ |
| \hat{\boldsymbol X} = {\boldsymbol X} - \vec{e} \hat{x}^T |
| \f] |
| where \f$ \vec{e} \f$ is the vector of all ones. |
| |
| PCA then computes the SVD matrix factorization |
| \f[ |
| \hat{\boldsymbol X} = {\boldsymbol U}{\boldsymbol \Sigma}{\boldsymbol V}^T |
| \f] |
| where \f$ {\boldsymbol \Sigma} \f$ is a diagonal matrix. The eigenvalues are |
| recovered as the entries of \f$ {\boldsymbol \Sigma}/(\sqrt{(N-1)} \f$, and the principal |
| components are the rows of \f$ {\boldsymbol V} \f$. The reasoning behind using N − 1 instead of N to calculate the covariance is <a href="https://en.wikipedia.org/wiki/Bessel%27s_correction">Bessel's correction</a>. |
| |
| |
| @note It is important to note that this PCA implementation assumes that the user will |
| use only the principal components that have non-zero eigenvalues. The SVD |
| calculation is done with the Lanczos method, which does not guarantee |
| correctness for singular vectors with zero-valued eigenvalues. Consequently, |
| principal components with zero-valued eigenvalues are not guaranteed to be correct. |
| Generally, this will not be problem unless the user wants to use the |
| principal components for the entire eigenspectrum. |
| |
| |
| @anchor literature |
| @literature |
| |
| [1] Principal Component Analysis. http://en.wikipedia.org/wiki/Principal_component_analysis |
| |
| [2] Shlens, Jonathon (2009), A Tutorial on Principal Component Analysis |
| |
| |
| |
| @anchor related |
| @par Related Topics |
| |
| File pca.sql_in documenting the SQL functions |
| |
| \ref grp_pca_project |
| |
| */ |
| |
| -- ----------------------------------------------------------------------- |
| -- PCA for Dense matrices |
| -- ----------------------------------------------------------------------- |
| /* |
| @brief Compute principal components for a dense matrix stored in a |
| database table |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) |
| result_summary_table TEXT, -- Table name to store summary of results (Default: NULL) |
| variance DOUBLE PRECISION -- The proportion of variance (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(pca, pca, pca) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| -- Overloaded functions for optional parameters |
| -- ----------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components |
| result_summary_table TEXT -- Table name to store summary of results (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, $6, $7, $8, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER,-- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER,-- The number of Lanczos iterations for the SVD calculation |
| use_correlation BOOLEAN -- If True correlation matrix is used for principal components |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, $6, $7, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER,-- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER -- The number of Lanczos iterations for the SVD calculation |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, $6, False , NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER,-- Number of principal components to compute |
| grouping_cols TEXT -- Comma-separated list of grouping columns |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, 0, False , NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER -- Number of principal components to compute |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, NULL, 0, False, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| variance DOUBLE PRECISION, -- Proportion of variance |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER,-- The number of Lanczos iterations for the SVD calculation |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components |
| result_summary_table TEXT -- Table name to store summary of results (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, NULL, $5, $6, $7, $8, $4) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| variance DOUBLE PRECISION, -- Proportion of variance |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER,-- The number of Lanczos iterations for the SVD calculation |
| use_correlation BOOLEAN -- If True correlation matrix is used for principal components |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, NULL, $5, $6, $7, NULL, $4) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| variance DOUBLE PRECISION, -- Proportion of variance |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER -- The number of Lanczos iterations for the SVD calculation |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, NULL, $5, $6, False , NULL, $4) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| variance DOUBLE PRECISION, -- Proportion of variance |
| grouping_cols TEXT -- Comma-separated list of grouping columns |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, NULL, $5, 0, False , NULL, $4) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| variance DOUBLE PRECISION -- Proportion of variance |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, NULL, NULL, 0, False, NULL, $4) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Information Functions |
| -- ----------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pca_train( |
| usage_string VARCHAR -- usage string |
| ) |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`pca', `pca') |
| return pca.pca_help_message(schema_madlib, usage_string) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pca_train() |
| RETURNS VARCHAR AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.pca_train(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| -- ----------------------------------------------------------------------- |
| -- PCA for Sparse matrices |
| -- ----------------------------------------------------------------------- |
| /* |
| @brief Compute principal components for a sparse matrix stored in a |
| database table |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Name of 'row_id' column in sparse matrix representation |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of eigenvectors with dominant eigenvalues, sorted decreasingly |
| grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) |
| result_summary_table TEXT, -- Table name to store summary of results (Default: NULL) |
| variance DOUBLE PRECISION -- The proportion of variance (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(pca, pca, pca_sparse) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| -- Overloaded functions for optional parameters |
| -- ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Name of 'row_id' column in sparse matrix representation |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of eigenvectors with dominant eigenvalues, sorted decreasingly |
| grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) |
| result_summary_table TEXT -- Table name to store summary of results (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation |
| use_correlation BOOLEAN -- If True correlation matrix is used for principal components |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER -- The number of Lanczos iterations for the SVD calculation |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, False , NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT -- Comma-separated list of grouping columns |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, $9, 0, False , NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| k INTEGER -- Number of principal components to compute |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, $8, NULL, 0, False, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Name of 'row_id' column in sparse matrix representation |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| variance DOUBLE PRECISION, -- proportion of variance |
| grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) |
| result_summary_table TEXT -- Table name to store summary of results (Default: NULL) |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, NULL, $9, $10, $11, $12, $8) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| variance DOUBLE PRECISION, -- proportion of variance |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation |
| use_correlation BOOLEAN -- If True correlation matrix is used for principal components |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, NULL, $9, $10, $11, NULL, $8) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| variance DOUBLE PRECISION, -- proportion of variance |
| grouping_cols TEXT, -- Comma-separated list of grouping columns |
| lanczos_iter INTEGER -- The number of Lanczos iterations for the SVD calculation |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, NULL, $9, $10, False , NULL, $8) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| variance DOUBLE PRECISION, -- proportion of variance |
| grouping_cols TEXT -- Comma-separated list of grouping columns |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, NULL, $9, 0, False , NULL, $8) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.pca_sparse_train( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| col_id TEXT, -- Name of 'col_id' column in sparse matrix representation |
| val_id TEXT, -- Name of 'val_id' column in sparse matrix representation |
| row_dim INTEGER, -- Number of rows in the sparse matrix |
| col_dim INTEGER, -- Number of columns in the sparse matrix |
| variance DOUBLE PRECISION -- proportion of variance |
| ) |
| RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pca_sparse_train($1, $2, $3, $4, $5, $6, $7, NULL, NULL, 0, False, NULL, $8) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| -- ----------------------------------------------------------------------- |
| -- Information Functions |
| -- ----------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pca_sparse_train( |
| usage_string VARCHAR -- usage string |
| ) |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`pca', `pca') |
| return pca.pca_sparse_help_message(schema_madlib, usage_string) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pca_sparse_train() |
| RETURNS TEXT AS $$ |
| BEGIN |
| RETURN MADLIB_SCHEMA.pca_sparse_train(''); |
| END; |
| $$ LANGUAGE plpgsql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_union( |
| source_table TEXT, -- Source table name (dense matrix) |
| pc_table TEXT, -- Output table name for the principal components |
| pc_table_mean TEXT, -- Output table name for the principal components |
| row_id TEXT, -- Column name for the ID for each row |
| k INTEGER, -- Number of principal components to compute |
| grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL) |
| lanczos_iter INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension)) |
| use_correlation BOOLEAN, -- If True correlation matrix is used for principal components (Default: False) |
| result_summary_table TEXT, -- Table name to store summary of results (Default: NULL) |
| result_summary_table_temp TEXT, -- Table name to store summary of results (Default: NULL) |
| variance DOUBLE PRECISION, -- The proportion of variance (Default: NULL) |
| grp_id INTEGER, -- a place holder id for each group |
| grouping_where_clause TEXT, -- WHERE clause using grouping_cols |
| sparse_where_condition TEXT, -- WHERE clause used when creating temp sparse matrix table with dims |
| select_grouping_cols TEXT, -- SELECT clause using grouping_cols |
| temp_table_columns TEXT, -- SELECT caluse for creating temporary copy of the source_table |
| is_sparse BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices |
| col_id TEXT, -- sparse representation based detail |
| val_id TEXT, -- sparse representation based detail |
| row_dim INTEGER, -- sparse representation based detail |
| col_dim INTEGER -- sparse representation based detail |
| ) |
| RETURNS VOID AS $$ |
| PythonFunction(pca, pca, _pca_union) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |