| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file correlation.sql_in |
| * |
| * @brief SQL functions for correlation computation |
| * @date April 2013 |
| * |
| * @sa For a brief introduction to correlation, see the |
| * module description \ref grp_correlation |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_correlation |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#usage">Covariance and Correlation Functions</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#literature">Literature</a></li> |
| <li><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| @brief Generates a covariance or Pearson correlation matrix for pairs |
| of numeric columns in a table. |
| |
| A correlation function is the degree and direction of association of two |
| variables—how well one random variable can be predicted from the other. |
| It is a normalized version of covariance. |
| The Pearson correlation coefficient is used here, which has a value between -1 and 1, |
| where 1 implies total positive linear correlation, 0 means no linear correlation, |
| and -1 means total negative linear correlation. |
| |
| This function generates an \f$N\f$x\f$N\f$ cross correlation matrix for |
| pairs of numeric columns in a <em>source_table</em>. It is square symmetrical |
| with the \f$ (i,j) \f$th element equal to the correlation coefficient between the |
| \f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of |
| variables with themselves) are always equal to 1.0. |
| |
| We also provide a covariance function which is similar in nature to correlation, |
| and is a measure of the joint variability of two random variables. |
| |
| @anchor usage |
| @par Covariance and Correlation Functions |
| |
| The correlation function has the following syntax: |
| <pre class="syntax"> |
| correlation( source_table, |
| output_table, |
| target_cols, |
| verbose, |
| grouping_cols, |
| n_groups_per_run |
| ) |
| </pre> |
| |
| The covariance function has a similar syntax: |
| <pre class="syntax"> |
| covariance( source_table, |
| output_table, |
| target_cols, |
| verbose, |
| grouping_cols, |
| n_groups_per_run |
| ) |
| </pre> |
| |
| <dl class="arglist"> |
| |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing the input data.</dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the table containing the cross correlation matrix. |
| The output table has N rows, where N is the number |
| of '<em>target_cols</em>' in the '<em>source_table'</em> for which |
| correlation or covariance is being computed. |
| It has the following columns: |
| <table class="output"> |
| <tr> |
| <th>column_position</th> |
| <td>An automatically generated sequential counter indicating the order of the |
| variable in the '<em>output_table</em>'.</td> |
| </tr> |
| <tr> |
| <th>variable</th> |
| <td>Contains the row header for the variables of interest.</td> |
| </tr> |
| <tr> |
| <th>grouping_cols</th> |
| <td>Contains the grouping columns, if any.</td> |
| </tr> |
| <tr> |
| <th><...></th> |
| <td>The remainder of the table is the NxN correlation matrix for the pairs of |
| variables of interest.</td> |
| </tr> |
| </table> |
| The output table is arranged as a lower-triangular matrix with the upper |
| triangle set to NULL and the diagonal elements set to 1.0. To obtain the result |
| from the '<em>output_table</em>' order by '<em>column_position</em>': |
| <pre class="example"> |
| SELECT * FROM output_table ORDER BY column_position; |
| </pre> |
| |
| In addition to output table, a summary table named \<output_table\>_summary |
| is also created, which has the following columns: |
| <table class="output"> |
| <tr><th>method</th><td>'Correlation' or 'Covariance'</td></tr> |
| <tr><th>source_table</th><td>VARCHAR. Data source table name.</td></tr> |
| <tr><th>output_table</th><td>VARCHAR. Output table name.</td></tr> |
| <tr><th>column_names</th><td>VARCHAR. Column names used for correlation |
| computation, as a comma-separated string.</td></tr> |
| <tr><th>grouping_cols</th> |
| <td>Contains the grouping columns, if any.</td></tr> |
| <tr><th>mean_vector</th><td>FLOAT8[]. Mean value of column |
| for variables of interest.</td></tr> |
| <tr> |
| <th>total_rows_processed</th> |
| <td>BIGINT. Total numbers of rows processed.</td> |
| </tr> |
| </table> |
| </dd> |
| |
| <dt>target_cols (optional)</dt> |
| <dd>TEXT, default: '*'. A comma-separated list of the columns to correlate. |
| If NULL or <tt>'*'</tt>, results are produced for all numeric columns.</dd> |
| |
| <dt>verbose (optional)</dt> |
| <dd>BOOLEAN, default: FALSE. Print verbose information if TRUE.</dd> |
| |
| <dt>grouping_cols (optional)</dt> |
| <dd>TEXT, default: NULL. A comma-separated list of the columns to group by.</dd> |
| |
| <dt>n_groups_per_run (optional)</dt> |
| <dd>INTEGER, default: 10. Number of groups to process at a time. |
| This parameter is ignored if 'grouping_cols' is not specified. |
| Generally the default value will work fine, but there may be cases |
| (see below) where you will want to experiment with it |
| to reduce execution time and memory usage. |
| </dd> |
| @note |
| This is a lower level parameter that can potentially be used to |
| improve performance, but should be used with caution. |
| It is designed to handle the case where you have a large number |
| of groups. |
| In general, increasing 'n_groups_per_run' means we |
| construct a larger 'UNION ALL' query which uses more memory and may slow down execution |
| if it gets too big. |
| If you have a large number of groups and a smaller data size, there may |
| be benefits to increasing this value. |
| Conversely, decreasing 'n_groups_per_run' means we issue |
| more 'plpy.execute' commands. This increases overhead and can modestly |
| affect the execution time. |
| </dl> |
| |
| |
| @anchor examples |
| @examp |
| |
| -# Create an input dataset. |
| <pre class="example"> |
| DROP TABLE IF EXISTS example_data CASCADE; |
| CREATE TABLE example_data( |
| id SERIAL, |
| outlook TEXT, |
| temperature FLOAT8, |
| humidity FLOAT8, |
| windy TEXT, |
| class TEXT, |
| day TEXT |
| ); |
| INSERT INTO example_data VALUES |
| (1, 'sunny', 85, 85, 'false', 'Dont Play', 'Mon'), |
| (2, 'sunny', 80, 90, 'true', 'Dont Play', 'Mon'), |
| (3, 'overcast', 83, 78, 'false', 'Play', 'Mon'), |
| (4, 'rain', 70, 96, 'false', 'Play', 'Mon'), |
| (5, 'rain', 68, 80, 'false', 'Play', 'Mon'), |
| (6, 'rain', 65, 70, 'true', 'Dont Play', 'Mon'), |
| (7, 'overcast', 64, 65, 'true', 'Play', 'Mon'), |
| (8, 'sunny', 72, 95, 'false', 'Dont Play', 'Mon'), |
| (9, 'sunny', 69, 70, 'false', 'Play', 'Mon'), |
| (10, 'rain', 75, 80, 'false', 'Play', 'Mon'), |
| (11, 'sunny', 75, 70, 'true', 'Play', 'Mon'), |
| (12, 'overcast', 72, 90, 'true', 'Play', 'Mon'), |
| (13, 'overcast', 81, 75, 'false', 'Play', 'Mon'), |
| (14, 'rain', 71, 80, 'true', 'Dont Play', 'Mon'), |
| (15, NULL, 100, 100, 'true', NULL, 'Mon'), |
| (16, NULL, 110, 100, 'true', NULL, 'Mon'), |
| (101, 'sunny', 85, 85, 'false', 'Dont Play', 'Tues'), |
| (102, 'sunny', 80, 90, 'true', 'Dont Play', 'Tues'), |
| (103, 'overcast', 83, 78, 'false', 'Play', 'Tues'), |
| (104, 'rain', 70, 96, 'false', 'Play', 'Tues'), |
| (105, 'rain', 68, 80, 'false', 'Play', 'Tues'), |
| (106, 'rain', 65, 70, 'true', 'Dont Play', 'Tues'), |
| (107, 'overcast', 64, 65, 'true', 'Play', 'Tues'), |
| (108, 'sunny', 72, 95, 'false', 'Dont Play', 'Tues'), |
| (109, 'sunny', 69, 70, 'false', 'Play', 'Tues'), |
| (110, 'rain', 75, 80, 'false', 'Play', 'Tues'), |
| (111, 'sunny', 75, 70, 'true', 'Play', 'Tues'), |
| (112, 'overcast', 72, 90, 'true', 'Play', 'Tues'), |
| (113, 'overcast', 81, 75, 'false', 'Play', 'Tues'), |
| (114, 'rain', 71, 80, 'true', 'Dont Play', 'Tues'), |
| (115, NULL, 100, 100, 'true', NULL, 'Tues'), |
| (116, NULL, 110, 100, 'true', NULL, 'Tues'), |
| (201, 'sunny', 85, 85, 'false', 'Dont Play', 'Wed'), |
| (202, 'sunny', 80, 90, 'true', 'Dont Play', 'Wed'), |
| (203, 'overcast', 83, 78, 'false', 'Play', 'Wed'), |
| (204, 'rain', 70, 96, 'false', 'Play', 'Wed'), |
| (205, 'rain', 68, 80, 'false', 'Play', 'Wed'), |
| (206, 'rain', 65, 70, 'true', 'Dont Play', 'Wed'), |
| (207, 'overcast', 64, 65, 'true', 'Play', 'Wed'), |
| (208, 'sunny', 7, 95, 'false', 'Dont Play', 'Wed'), |
| (209, 'sunny', 6, 70, 'false', 'Play', 'Wed'), |
| (210, 'rain', 7, 80, 'false', 'Play', 'Wed'), |
| (211, 'sunny', 75, 70, 'true', 'Play', 'Wed'), |
| (212, 'overcast', 72, 90, 'true', 'Play', 'Wed'), |
| (213, 'overcast', 81, 75, 'false', 'Play', 'Wed'), |
| (214, 'rain', 71, 80, 'true', 'Dont Play', 'Wed'), |
| (215, NULL, 10, 100, 'true', NULL, 'Wed'), |
| (216, NULL, 10, 100, 'true', NULL, 'Wed'), |
| (217, 'sunny', 85, 85, 'false', 'Dont Play', 'Wed'), |
| (218, 'sunny', 80, 9, 'true', 'Dont Play', 'Wed'), |
| (219, 'overcast', 83, 78, 'false', 'Play', 'Wed'), |
| (220, 'rain', 70, 9, 'false', 'Play', 'Wed'), |
| (221, 'rain', 68, 80, 'false', 'Play', 'Wed'); |
| </pre> |
| |
| -# Get correlation between temperature and humidity: |
| <pre class="example"> |
| DROP TABLE IF EXISTS example_data_output, example_data_output_summary; |
| SELECT madlib.correlation( 'example_data', |
| 'example_data_output', |
| 'temperature, humidity' |
| ); |
| </pre> |
| View the correlation matrix: |
| <pre class="example"> |
| SELECT * FROM example_data_output ORDER BY column_position; |
| </pre> |
| <pre class="result"> |
| column_position | variable | temperature | humidity |
| -----------------+-------------+---------------------+---------- |
| 1 | temperature | 1 | |
| 2 | humidity | 0.00607993890408995 | 1 |
| (2 rows) |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM example_data_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]--------+----------------------------------- |
| method | Correlation |
| source | example_data |
| output_table | example_data_output |
| column_names | {temperature,humidity} |
| mean_vector | {70.188679245283,79.8679245283019} |
| total_rows_processed | 53 |
| </pre> |
| |
| -# Correlation with grouping by day: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS example_data_output, example_data_output_summary; |
| SELECT madlib.correlation( 'example_data', |
| 'example_data_output', |
| 'temperature, humidity', |
| FALSE, |
| 'day' |
| ); |
| </pre> |
| View the correlation matrix by group: |
| <pre class="example"> |
| SELECT * FROM example_data_output ORDER BY day, column_position; |
| </pre> |
| <pre class="result"> |
| column_position | variable | day | temperature | humidity |
| -----------------+-------------+------+-------------------+---------- |
| 1 | temperature | Mon | 1 | |
| 2 | humidity | Mon | 0.616876934548786 | 1 |
| 1 | temperature | Tues | 1 | |
| 2 | humidity | Tues | 0.616876934548786 | 1 |
| 1 | temperature | Wed | 1 | |
| 2 | humidity | Wed | -0.28969669368457 | 1 |
| (6 rows) |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM example_data_output_summary ORDER BY day; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]--------+------------------------------------ |
| method | Correlation |
| source | example_data |
| output_table | example_data_output |
| column_names | {temperature,humidity} |
| day | Mon |
| mean_vector | {77.5,82.75} |
| total_rows_processed | 16 |
| -[ RECORD 2 ]--------+------------------------------------ |
| method | Correlation |
| source | example_data |
| output_table | example_data_output |
| column_names | {temperature,humidity} |
| day | Tues |
| mean_vector | {77.5,82.75} |
| total_rows_processed | 16 |
| -[ RECORD 3 ]--------+------------------------------------ |
| method | Correlation |
| source | example_data |
| output_table | example_data_output |
| column_names | {temperature,humidity} |
| day | Wed |
| mean_vector | {59.0476190476191,75.4761904761905} |
| total_rows_processed | 21 |
| </pre> |
| |
| -# Get covariance between temperature and humidity: |
| <pre class="example"> |
| \\x off |
| DROP TABLE IF EXISTS example_data_output, example_data_output_summary; |
| SELECT madlib.covariance( 'example_data', |
| 'example_data_output', |
| 'temperature, humidity' |
| ); |
| </pre> |
| View the covariance matrix: |
| <pre class="example"> |
| SELECT * FROM example_data_output ORDER BY column_position; |
| </pre> |
| <pre class="result"> |
| column_position | variable | temperature | humidity |
| -----------------+-------------+------------------+------------------ |
| 1 | temperature | 507.926664293343 | |
| 2 | humidity | 2.40227839088644 | 307.359914560342 |
| (2 rows) |
| </pre> |
| View the summary table: |
| <pre class="example"> |
| \\x on |
| SELECT * FROM example_data_output_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]--------+----------------------------------- |
| method | Covariance |
| source | example_data |
| output_table | example_data_output |
| column_names | {temperature,humidity} |
| mean_vector | {70.188679245283,79.8679245283019} |
| total_rows_processed | 53 |
| </pre> |
| |
| @par Notes |
| |
| Null values will be replaced by the mean of their respective columns (mean |
| imputation/substitution). Mean imputation is a method in which the missing |
| value on a certain variable is replaced by the mean of the available cases. |
| This method maintains the sample size and is easy to use, but the variability |
| in the data is reduced, so the standard deviations and the variance estimates |
| tend to be underestimated. Please refer to [1] and [2] for details. |
| |
| If the mean imputation method is not suitable for the target use case, it is |
| advised to employ a view that handles the NULL values prior to calling the |
| correlation/covariance functions. |
| |
| @anchor literature |
| @literature |
| |
| [1] https://en.wikipedia.org/wiki/Imputation_(statistics) |
| |
| [2] https://www.iriseekhout.com/missing-data/missing-data-methods/imputation-methods/ |
| |
| @anchor related |
| @par Related Topics |
| |
| File correlation.sql_in documenting the SQL functions |
| |
| \ref grp_summary for general descriptive statistics for a table |
| |
| */ |
| |
| ----------------------------------------------------------------------- |
| -- Aggregate function for correlation |
| ----------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_transition( |
| state double precision[], |
| x double precision[], |
| mean double precision[] |
| ) RETURNS double precision[] AS |
| 'MODULE_PATHNAME', 'correlation_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_merge( |
| left_state double precision[], |
| right_state double precision[] |
| ) RETURNS double precision[] AS |
| 'MODULE_PATHNAME', 'correlation_merge_states' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_final( |
| state double precision[] |
| ) RETURNS double precision[] AS |
| 'MODULE_PATHNAME', 'correlation_final' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.correlation_agg( |
| double precision[], double precision[]); |
| CREATE AGGREGATE MADLIB_SCHEMA.correlation_agg( |
| /* x */ double precision[], |
| /* mean */ double precision[] |
| ) ( |
| SType = double precision[], |
| SFunc = MADLIB_SCHEMA.correlation_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.correlation_merge,') |
| FinalFunc = MADLIB_SCHEMA.correlation_final |
| -- use NULL as the initial value |
| ); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.covariance_agg( |
| double precision[], double precision[]); |
| CREATE AGGREGATE MADLIB_SCHEMA.covariance_agg( |
| /* x */ double precision[], |
| /* mean */ double precision[] |
| ) ( |
| SType = double precision[], |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.correlation_merge,') |
| SFunc = MADLIB_SCHEMA.correlation_transition |
| -- use NULL as the initial value |
| -- return the last transition or merge state as the final state |
| -- this aggregate does not divide by the number of samples |
| -- (hence it's sum of (x-mean)^2 instead of expectation) |
| ); |
| |
| ----------------------------------------------------------------------- |
| -- Main function for correlation |
| ----------------------------------------------------------------------- |
| /* @brief Compute a correlation matrix for a table with optional target columns specified |
| |
| @param source_table Name of source relation containing the data |
| @param output_table Name of output table name to store the correlation |
| @param target_cols String with comma separated list of columns for which cross-correlation is desired |
| @param verbose Flag to determine verbosity |
| |
| @usage |
| <pre> SELECT MADLIB_SCHEMA.correlation ( |
| '<em>source_table</em>', '<em>output_table</em>', |
| '<em>target_cols</em>' |
| ); |
| SELECT * FROM '<em>output_table</em>' ORDER BY '<em>colum_position</em>'; |
| </pre> |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation( |
| source_table varchar, -- input table name |
| output_table varchar, -- output table name |
| target_cols varchar, -- comma separated list of output cols (default = '*') |
| verbose boolean, -- flag to determine verbosity |
| grouping_cols varchar, -- comma separated column names to be used for grouping |
| n_groups_per_run integer -- number of groups to process at a time |
| ) RETURNS TEXT AS $$ |
| PythonFunction(stats, correlation, correlation) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------- |
| -- Overloaded functions |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation( |
| source_table varchar, -- input table name |
| output_table varchar, -- output table name |
| target_cols varchar, -- comma separated list of output cols (default = '*') |
| verbose boolean, -- flag to determine verbosity |
| grouping_cols varchar -- comma separated column names to be used for grouping |
| ) |
| RETURNS TEXT AS $$ |
| select MADLIB_SCHEMA.correlation($1, $2, $3, $4, $5, 10) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation( |
| source_table varchar, -- input table name |
| output_table varchar, -- output table name |
| target_cols varchar, -- comma separated list of output cols (default = '*') |
| verbose boolean -- flag to determine verbosity |
| ) |
| RETURNS TEXT AS $$ |
| select MADLIB_SCHEMA.correlation($1, $2, $3, $4, NULL, 10) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation( |
| source_table varchar, -- input table name |
| output_table varchar, -- output table name |
| target_cols varchar -- comma separated list of output cols (default = '*') |
| ) |
| RETURNS TEXT AS $$ |
| select MADLIB_SCHEMA.correlation($1, $2, $3, FALSE, NULL, 10) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation( |
| source_table varchar, -- input table name |
| output_table varchar -- output table name |
| ) RETURNS TEXT AS $$ |
| select MADLIB_SCHEMA.correlation($1, $2, NULL, FALSE, NULL, 10) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------- |
| -- Help functions |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation( |
| input_message text |
| ) RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`stats', `correlation') |
| with AOControl(False): |
| return correlation.correlation_help_message(schema_madlib, input_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation() |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`stats', `correlation') |
| with AOControl(False): |
| return correlation.correlation_help_message(schema_madlib, None) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |
| |
| ----------------------------------------------------------------------- |
| -- Main function for covariance |
| ----------------------------------------------------------------------- |
| /* @brief Compute a covariance matrix for a table with optional target columns specified |
| |
| @param source_table Name of source relation containing the data |
| @param output_table Name of output table name to store the correlation |
| @param target_cols String with comma separated list of columns for which cross-correlation is desired |
| @param verbose Flag to determine verbosity |
| |
| @usage |
| <pre> SELECT MADLIB_SCHEMA.covariance ( |
| '<em>source_table</em>', '<em>output_table</em>', |
| '<em>target_cols</em>' |
| ); |
| SELECT * FROM '<em>output_table</em>' ORDER BY '<em>colum_position</em>'; |
| </pre> |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance( |
| source_table varchar, -- input table name |
| output_table varchar, -- output table name |
| target_cols varchar, -- comma separated list of output cols (default = '*') |
| verbose boolean, -- flag to determine verbosity |
| grouping_cols varchar, -- comma separated column names to be used for grouping |
| n_groups_per_run integer -- number of groups to process at a time |
| ) RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`stats', `correlation') |
| with AOControl(False): |
| return correlation.correlation(schema_madlib, source_table, output_table, |
| target_cols, grouping_cols, True, verbose, |
| n_groups_per_run) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------- |
| -- Overloaded functions |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance( |
| source_table varchar, -- input table name |
| output_table varchar, -- output table name |
| target_cols varchar, -- comma separated list of output cols (default = '*') |
| verbose BOOLEAN, -- flag to determine verbosity |
| grouping_cols varchar -- comma separated column names to be used for grouping |
| |
| ) |
| RETURNS TEXT AS $$ |
| select MADLIB_SCHEMA.covariance($1, $2, $3, $4, $5, 10) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance( |
| source_table varchar, -- input table name |
| output_table varchar, -- output table name |
| target_cols varchar, -- comma separated list of output cols (default = '*') |
| verbose BOOLEAN -- flag to determine verbosity |
| ) |
| RETURNS TEXT AS $$ |
| select MADLIB_SCHEMA.covariance($1, $2, $3, $4, NULL, 10) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance( |
| source_table varchar, -- input table name |
| output_table varchar, -- output table name |
| target_cols varchar -- comma separated list of output cols (default = '*') |
| ) |
| RETURNS TEXT AS $$ |
| select MADLIB_SCHEMA.covariance($1, $2, $3, FALSE, NULL, 10) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance( |
| source_table varchar, -- input table name |
| output_table varchar -- output table name |
| ) RETURNS TEXT AS $$ |
| select MADLIB_SCHEMA.covariance($1, $2, NULL, FALSE, NULL, 10) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------- |
| -- Help functions |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance( |
| input_message text |
| ) RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`stats', `correlation') |
| with AOControl(False): |
| return correlation.correlation_help_message(schema_madlib, input_message, cov=True) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance() |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`stats', `correlation') |
| with AOControl(False): |
| return correlation.correlation_help_message(schema_madlib, None, cov=True) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| ------------------------------------------------------------------------- |