src/ports/postgres/modules/stats/correlation.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file correlation.sql_in
  *
  * @brief SQL functions for correlation computation
  * @date April 2013
  *
  * @sa For a brief introduction to correlation, see the
  *     module description \ref grp_correlation
  *
  *//* ----------------------------------------------------------------------- */

 m4_include(`SQLCommon.m4')


 /**
 @addtogroup grp_correlation

 <div class="toc"><b>Contents</b>
 <ul>
 <li><a href="#usage">Covariance and Correlation Functions</a></li>
 <li><a href="#examples">Examples</a></li>
 <li><a href="#literature">Literature</a></li>
 <li><a href="#related">Related Topics</a></li>
 </ul>
 </div>

 @brief Generates a covariance or Pearson correlation matrix for pairs
 of numeric columns in a table.

 A correlation function is the degree and direction of association of two
 variables&mdash;how well one random variable can be predicted from the other.
 It is a normalized version of covariance.
 The Pearson correlation coefficient is used here, which has a value between -1 and 1,
 where 1 implies total positive linear correlation, 0 means no linear correlation,
 and -1 means total negative linear correlation.

 This function generates an \f$N\f$x\f$N\f$ cross correlation matrix for
 pairs of numeric columns in a <em>source_table</em>. It is square symmetrical
 with the \f$ (i,j) \f$th element equal to the correlation coefficient between the
 \f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of
 variables with themselves) are always equal to 1.0.

 We also provide a covariance function which is similar in nature to correlation,
 and is a measure of the joint variability of two random variables.

 @anchor usage
 @par Covariance and Correlation Functions

 The correlation function has the following syntax:
 <pre class="syntax">
 correlation( source_table,
              output_table,
              target_cols,
              verbose,
              grouping_cols,
              n_groups_per_run
            )
 </pre>

 The covariance function has a similar syntax:
 <pre class="syntax">
 covariance( source_table,
             output_table,
             target_cols,
             verbose,
             grouping_cols,
             n_groups_per_run
           )
 </pre>

 <dl class="arglist">

 <dt>source_table</dt>
 <dd>TEXT. Name of the table containing the input data.</dd>

 <dt>output_table</dt>
 <dd>TEXT. Name of the table containing the cross correlation matrix.
 The output table has N rows, where N is the number
 of '<em>target_cols</em>' in the '<em>source_table'</em> for which
 correlation or covariance is being computed.
 It has the following columns:
 <table class="output">
 <tr>
 <th>column_position</th>
 <td>An automatically generated sequential counter indicating the order of the
 variable in the '<em>output_table</em>'.</td>
 </tr>
 <tr>
 <th>variable</th>
 <td>Contains the row header for the variables of interest.</td>
 </tr>
 <tr>
 <th>grouping_cols</th>
 <td>Contains the grouping columns, if any.</td>
 </tr>
 <tr>
 <th><...></th>
 <td>The remainder of the table is the NxN correlation matrix for the pairs of
 variables of interest.</td>
 </tr>
 </table>
 The output table is arranged as a lower-triangular matrix with the upper
 triangle set to NULL and the diagonal elements set to 1.0. To obtain the result
 from the '<em>output_table</em>' order by '<em>column_position</em>':
 <pre class="example">
 SELECT * FROM output_table ORDER BY column_position;
 </pre>

 In addition to output table, a summary table named \<output_table\>_summary
 is also created, which has the following columns:
 <table class="output">
 <tr><th>method</th><td>'Correlation' or 'Covariance'</td></tr>
 <tr><th>source_table</th><td>VARCHAR. Data source table name.</td></tr>
 <tr><th>output_table</th><td>VARCHAR. Output table name.</td></tr>
 <tr><th>column_names</th><td>VARCHAR. Column names used for correlation
 computation, as a comma-separated string.</td></tr>
 <tr><th>grouping_cols</th>
 <td>Contains the grouping columns, if any.</td></tr>
 <tr><th>mean_vector</th><td>FLOAT8[]. Mean value of column
 for variables of interest.</td></tr>
 <tr>
   <th>total_rows_processed</th>
   <td>BIGINT. Total numbers of rows processed.</td>
 </tr>
 </table>
 </dd>

 <dt>target_cols (optional)</dt>
 <dd>TEXT, default: '*'. A comma-separated list of the columns to correlate.
 If NULL or <tt>'*'</tt>, results are produced for all numeric columns.</dd>

 <dt>verbose (optional)</dt>
 <dd>BOOLEAN, default: FALSE. Print verbose information if TRUE.</dd>

 <dt>grouping_cols (optional)</dt>
 <dd>TEXT, default: NULL. A comma-separated list of the columns to group by.</dd>

 <dt>n_groups_per_run (optional)</dt>
 <dd>INTEGER, default: 10. Number of groups to process at a time.
 This parameter is ignored if 'grouping_cols' is not specified.
 Generally the default value will work fine, but there may be cases
 (see below) where you will want to experiment with it
 to reduce execution time and memory usage.
 </dd>
 @note
 This is a lower level parameter that can potentially be used to
 improve performance, but should be used with caution.
 It is designed to handle the case where you have a large number
 of groups.
 In general, increasing 'n_groups_per_run' means we
 construct a larger 'UNION ALL' query which uses more memory and may slow down execution
 if it gets too big.
 If you have a large number of groups and a smaller data size, there may
 be benefits to increasing this value.
 Conversely, decreasing 'n_groups_per_run' means we issue
 more 'plpy.execute' commands.  This increases overhead and can modestly
 affect the execution time.
 </dl>


 @anchor examples
 @examp

 -# Create an input dataset.
 <pre class="example">
 DROP TABLE IF EXISTS example_data CASCADE;
 CREATE TABLE example_data(
     id SERIAL,
     outlook TEXT,
     temperature FLOAT8,
     humidity FLOAT8,
     windy TEXT,
     class TEXT,
     day TEXT
 );
 INSERT INTO example_data VALUES
 (1, 'sunny', 85, 85, 'false', 'Dont Play', 'Mon'),
 (2, 'sunny', 80, 90, 'true', 'Dont Play', 'Mon'),
 (3, 'overcast', 83, 78, 'false', 'Play', 'Mon'),
 (4, 'rain', 70, 96, 'false', 'Play', 'Mon'),
 (5, 'rain', 68, 80, 'false', 'Play', 'Mon'),
 (6, 'rain', 65, 70, 'true', 'Dont Play', 'Mon'),
 (7, 'overcast', 64, 65, 'true', 'Play', 'Mon'),
 (8, 'sunny', 72, 95, 'false', 'Dont Play', 'Mon'),
 (9, 'sunny', 69, 70, 'false', 'Play', 'Mon'),
 (10, 'rain', 75, 80, 'false', 'Play', 'Mon'),
 (11, 'sunny', 75, 70, 'true', 'Play', 'Mon'),
 (12, 'overcast', 72, 90, 'true', 'Play', 'Mon'),
 (13, 'overcast', 81, 75, 'false', 'Play', 'Mon'),
 (14, 'rain', 71, 80, 'true', 'Dont Play', 'Mon'),
 (15, NULL, 100, 100, 'true', NULL, 'Mon'),
 (16, NULL, 110, 100, 'true', NULL, 'Mon'),
 (101, 'sunny', 85, 85, 'false', 'Dont Play', 'Tues'),
 (102, 'sunny', 80, 90, 'true', 'Dont Play', 'Tues'),
 (103, 'overcast', 83, 78, 'false', 'Play', 'Tues'),
 (104, 'rain', 70, 96, 'false', 'Play', 'Tues'),
 (105, 'rain', 68, 80, 'false', 'Play', 'Tues'),
 (106, 'rain', 65, 70, 'true', 'Dont Play', 'Tues'),
 (107, 'overcast', 64, 65, 'true', 'Play', 'Tues'),
 (108, 'sunny', 72, 95, 'false', 'Dont Play', 'Tues'),
 (109, 'sunny', 69, 70, 'false', 'Play', 'Tues'),
 (110, 'rain', 75, 80, 'false', 'Play', 'Tues'),
 (111, 'sunny', 75, 70, 'true', 'Play', 'Tues'),
 (112, 'overcast', 72, 90, 'true', 'Play', 'Tues'),
 (113, 'overcast', 81, 75, 'false', 'Play', 'Tues'),
 (114, 'rain', 71, 80, 'true', 'Dont Play', 'Tues'),
 (115, NULL, 100, 100, 'true', NULL, 'Tues'),
 (116, NULL, 110, 100, 'true', NULL, 'Tues'),
 (201, 'sunny', 85, 85, 'false', 'Dont Play', 'Wed'),
 (202, 'sunny', 80, 90, 'true', 'Dont Play', 'Wed'),
 (203, 'overcast', 83, 78, 'false', 'Play', 'Wed'),
 (204, 'rain', 70, 96, 'false', 'Play', 'Wed'),
 (205, 'rain', 68, 80, 'false', 'Play', 'Wed'),
 (206, 'rain', 65, 70, 'true', 'Dont Play', 'Wed'),
 (207, 'overcast', 64, 65, 'true', 'Play', 'Wed'),
 (208, 'sunny', 7, 95, 'false', 'Dont Play', 'Wed'),
 (209, 'sunny', 6, 70, 'false', 'Play', 'Wed'),
 (210, 'rain', 7, 80, 'false', 'Play', 'Wed'),
 (211, 'sunny', 75, 70, 'true', 'Play', 'Wed'),
 (212, 'overcast', 72, 90, 'true', 'Play', 'Wed'),
 (213, 'overcast', 81, 75, 'false', 'Play', 'Wed'),
 (214, 'rain', 71, 80, 'true', 'Dont Play', 'Wed'),
 (215, NULL, 10, 100, 'true', NULL, 'Wed'),
 (216, NULL, 10, 100, 'true', NULL, 'Wed'),
 (217, 'sunny', 85, 85, 'false', 'Dont Play', 'Wed'),
 (218, 'sunny', 80, 9, 'true', 'Dont Play', 'Wed'),
 (219, 'overcast', 83, 78, 'false', 'Play', 'Wed'),
 (220, 'rain', 70, 9, 'false', 'Play', 'Wed'),
 (221, 'rain', 68, 80, 'false', 'Play', 'Wed');
 </pre>

 -# Get correlation between temperature and humidity:
 <pre class="example">
 DROP TABLE IF EXISTS example_data_output, example_data_output_summary;
 SELECT madlib.correlation( 'example_data',
                            'example_data_output',
                            'temperature, humidity'
                          );
 </pre>
 View the correlation matrix:
 <pre class="example">
 SELECT * FROM example_data_output ORDER BY column_position;
 </pre>
 <pre class="result">
  column_position |  variable   |     temperature     | humidity
 -----------------+-------------+---------------------+----------
                1 | temperature |                   1 |
                2 | humidity    | 0.00607993890408995 |        1
 (2 rows)
 </pre>
 View the summary table:
 <pre class="example">
 \\x on
 SELECT * FROM example_data_output_summary;
 </pre>
 <pre class="result">
 -[ RECORD 1 ]--------+-----------------------------------
 method               | Correlation
 source               | example_data
 output_table         | example_data_output
 column_names         | {temperature,humidity}
 mean_vector          | {70.188679245283,79.8679245283019}
 total_rows_processed | 53
 </pre>

 -# Correlation with grouping by day:
 <pre class="example">
 \\x off
 DROP TABLE IF EXISTS example_data_output, example_data_output_summary;
 SELECT madlib.correlation( 'example_data',
                            'example_data_output',
                            'temperature, humidity',
                            FALSE,
                            'day'
                          );
 </pre>
 View the correlation matrix by group:
 <pre class="example">
 SELECT * FROM example_data_output ORDER BY day, column_position;
 </pre>
 <pre class="result">
  column_position |  variable   | day  |    temperature    | humidity
 -----------------+-------------+------+-------------------+----------
                1 | temperature | Mon  |                 1 |
                2 | humidity    | Mon  | 0.616876934548786 |        1
                1 | temperature | Tues |                 1 |
                2 | humidity    | Tues | 0.616876934548786 |        1
                1 | temperature | Wed  |                 1 |
                2 | humidity    | Wed  | -0.28969669368457 |        1
 (6 rows)
 </pre>
 View the summary table:
 <pre class="example">
 \\x on
 SELECT * FROM example_data_output_summary ORDER BY day;
 </pre>
 <pre class="result">
 -[ RECORD 1 ]--------+------------------------------------
 method               | Correlation
 source               | example_data
 output_table         | example_data_output
 column_names         | {temperature,humidity}
 day                  | Mon
 mean_vector          | {77.5,82.75}
 total_rows_processed | 16
 -[ RECORD 2 ]--------+------------------------------------
 method               | Correlation
 source               | example_data
 output_table         | example_data_output
 column_names         | {temperature,humidity}
 day                  | Tues
 mean_vector          | {77.5,82.75}
 total_rows_processed | 16
 -[ RECORD 3 ]--------+------------------------------------
 method               | Correlation
 source               | example_data
 output_table         | example_data_output
 column_names         | {temperature,humidity}
 day                  | Wed
 mean_vector          | {59.0476190476191,75.4761904761905}
 total_rows_processed | 21
 </pre>

 -# Get covariance between temperature and humidity:
 <pre class="example">
 \\x off
 DROP TABLE IF EXISTS example_data_output, example_data_output_summary;
 SELECT madlib.covariance( 'example_data',
                           'example_data_output',
                           'temperature, humidity'
                          );
 </pre>
 View the covariance matrix:
 <pre class="example">
 SELECT * FROM example_data_output ORDER BY column_position;
 </pre>
 <pre class="result">
  column_position |  variable   |   temperature    |     humidity
 -----------------+-------------+------------------+------------------
                1 | temperature | 507.926664293343 |
                2 | humidity    | 2.40227839088644 | 307.359914560342
 (2 rows)
 </pre>
 View the summary table:
 <pre class="example">
 \\x on
 SELECT * FROM example_data_output_summary;
 </pre>
 <pre class="result">
 -[ RECORD 1 ]--------+-----------------------------------
 method               | Covariance
 source               | example_data
 output_table         | example_data_output
 column_names         | {temperature,humidity}
 mean_vector          | {70.188679245283,79.8679245283019}
 total_rows_processed | 53
 </pre>

 @par Notes

 Null values will be replaced by the mean of their respective columns (mean
 imputation/substitution). Mean imputation is a method in which the missing
 value on a certain variable is replaced by the mean of the available cases.
 This method maintains the sample size and is easy to use, but the variability
 in the data is reduced, so the standard deviations and the variance estimates
 tend to be underestimated. Please refer to [1] and [2] for details.

 If the mean imputation method is not suitable for the target use case, it is
 advised to employ a view that handles the NULL values prior to calling the
 correlation/covariance functions.

 @anchor literature
 @literature

 [1] https://en.wikipedia.org/wiki/Imputation_(statistics)

 [2] https://www.iriseekhout.com/missing-data/missing-data-methods/imputation-methods/

 @anchor related
 @par Related Topics

 File correlation.sql_in documenting the SQL functions

 \ref grp_summary for general descriptive statistics for a table

 */

 -----------------------------------------------------------------------
 -- Aggregate function for correlation
 -----------------------------------------------------------------------

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_transition(
     state       double precision[],
     x           double precision[],
     mean        double precision[]
 ) RETURNS double precision[] AS
     'MODULE_PATHNAME', 'correlation_transition'
 LANGUAGE C IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_merge(
     left_state  double precision[],
     right_state double precision[]
 ) RETURNS double precision[] AS
     'MODULE_PATHNAME', 'correlation_merge_states'
 LANGUAGE C IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_final(
     state       double precision[]
 ) RETURNS double precision[] AS
     'MODULE_PATHNAME', 'correlation_final'
 LANGUAGE C IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.correlation_agg(
     double precision[], double precision[]);
 CREATE AGGREGATE MADLIB_SCHEMA.correlation_agg(
     /* x */     double precision[],
     /* mean */  double precision[]
 ) (
     SType = double precision[],
     SFunc = MADLIB_SCHEMA.correlation_transition,
     m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.correlation_merge,')
     FinalFunc = MADLIB_SCHEMA.correlation_final
     -- use NULL as the initial value
 );

 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.covariance_agg(
     double precision[], double precision[]);
 CREATE AGGREGATE MADLIB_SCHEMA.covariance_agg(
     /* x */     double precision[],
     /* mean */  double precision[]
 ) (
     SType = double precision[],
     m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.correlation_merge,')
     SFunc = MADLIB_SCHEMA.correlation_transition
     -- use NULL as the initial value
     -- return the last transition or merge state as the final state
     -- this aggregate does not divide by the number of samples
     --    (hence it's sum of (x-mean)^2 instead of expectation)
 );

 -----------------------------------------------------------------------
 -- Main function for correlation
 -----------------------------------------------------------------------
 /* @brief Compute a correlation matrix for a table with optional target columns specified

    @param source_table Name of source relation containing the data
    @param output_table Name of output table name to store the correlation
    @param target_cols  String with comma separated list of columns for which cross-correlation is desired
    @param verbose      Flag to determine verbosity

    @usage
    <pre> SELECT MADLIB_SCHEMA.correlation (
          '<em>source_table</em>', '<em>output_table</em>',
          '<em>target_cols</em>'
      );
      SELECT * FROM '<em>output_table</em>' ORDER BY '<em>colum_position</em>';
    </pre>
 */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     source_table varchar, -- input table name
     output_table varchar, -- output table name
     target_cols  varchar, -- comma separated list of output cols (default = '*')
     verbose      boolean, -- flag to determine verbosity
     grouping_cols varchar, -- comma separated column names to be used for grouping
     n_groups_per_run integer -- number of groups to process at a time
 ) RETURNS TEXT AS $$
     PythonFunction(stats, correlation, correlation)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 -- Overloaded functions
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     source_table varchar, --  input table name
     output_table varchar, -- output table name
     target_cols  varchar,  -- comma separated list of output cols (default = '*')
     verbose      boolean,  -- flag to determine verbosity
     grouping_cols varchar -- comma separated column names to be used for grouping
 )
 RETURNS TEXT AS $$
     select MADLIB_SCHEMA.correlation($1, $2, $3, $4, $5, 10)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     source_table varchar, --  input table name
     output_table varchar, -- output table name
     target_cols  varchar,  -- comma separated list of output cols (default = '*')
     verbose      boolean  -- flag to determine verbosity
 )
 RETURNS TEXT AS $$
     select MADLIB_SCHEMA.correlation($1, $2, $3, $4, NULL, 10)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     source_table varchar, --  input table name
     output_table varchar, -- output table name
     target_cols  varchar  -- comma separated list of output cols (default = '*')
 )
 RETURNS TEXT AS $$
     select MADLIB_SCHEMA.correlation($1, $2, $3, FALSE, NULL, 10)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     source_table varchar, --  input table name
     output_table varchar  -- output table name
 ) RETURNS TEXT AS $$
     select MADLIB_SCHEMA.correlation($1, $2, NULL, FALSE, NULL, 10)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 -- Help functions
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
     input_message       text
 ) RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`stats', `correlation')
     with AOControl(False):
         return correlation.correlation_help_message(schema_madlib, input_message)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation()
 RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`stats', `correlation')
     with AOControl(False):
         return correlation.correlation_help_message(schema_madlib, None)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 -------------------------------------------------------------------------

 -----------------------------------------------------------------------
 -- Main function for covariance
 -----------------------------------------------------------------------
 /* @brief Compute a covariance matrix for a table with optional target columns specified

    @param source_table Name of source relation containing the data
    @param output_table Name of output table name to store the correlation
    @param target_cols  String with comma separated list of columns for which cross-correlation is desired
    @param verbose      Flag to determine verbosity

    @usage
    <pre> SELECT MADLIB_SCHEMA.covariance (
          '<em>source_table</em>', '<em>output_table</em>',
          '<em>target_cols</em>'
      );
      SELECT * FROM '<em>output_table</em>' ORDER BY '<em>colum_position</em>';
    </pre>
 */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     source_table varchar, --  input table name
     output_table varchar, -- output table name
     target_cols  varchar, -- comma separated list of output cols (default = '*')
     verbose      boolean, -- flag to determine verbosity
     grouping_cols varchar, -- comma separated column names to be used for grouping
     n_groups_per_run integer -- number of groups to process at a time
 ) RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`stats', `correlation')
     with AOControl(False):
         return correlation.correlation(schema_madlib, source_table, output_table,
                                        target_cols, grouping_cols, True, verbose,
                                        n_groups_per_run)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 -- Overloaded functions
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     source_table varchar, --  input table name
     output_table varchar, -- output table name
     target_cols  varchar, -- comma separated list of output cols (default = '*')
     verbose BOOLEAN,       -- flag to determine verbosity
     grouping_cols varchar -- comma separated column names to be used for grouping

 )
 RETURNS TEXT AS $$
     select MADLIB_SCHEMA.covariance($1, $2, $3, $4, $5, 10)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     source_table varchar, --  input table name
     output_table varchar, -- output table name
     target_cols  varchar, -- comma separated list of output cols (default = '*')
     verbose BOOLEAN       -- flag to determine verbosity
 )
 RETURNS TEXT AS $$
     select MADLIB_SCHEMA.covariance($1, $2, $3, $4, NULL, 10)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     source_table varchar, --  input table name
     output_table varchar, -- output table name
     target_cols  varchar  -- comma separated list of output cols (default = '*')
 )
 RETURNS TEXT AS $$
     select MADLIB_SCHEMA.covariance($1, $2, $3, FALSE, NULL, 10)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     source_table varchar, --  input table name
     output_table varchar  -- output table name
 ) RETURNS TEXT AS $$
     select MADLIB_SCHEMA.covariance($1, $2, NULL, FALSE, NULL, 10)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 -- Help functions
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
     input_message       text
 ) RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`stats', `correlation')
     with AOControl(False):
         return correlation.correlation_help_message(schema_madlib, input_message, cov=True)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance()
 RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`stats', `correlation')
     with AOControl(False):
         return correlation.correlation_help_message(schema_madlib, None, cov=True)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
 -------------------------------------------------------------------------