| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file correlation.sql_in |
| * |
| * @brief SQL functions for correlation computation |
| * @date April 2013 |
| * |
| * @sa For a brief introduction to correlation, see the |
| * module description \ref grp_correlation |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_correlation |
| |
| |
| @about |
| |
| A correlation function is the degree and direction of association of two |
| variables; how well can one random variable be predicted from the other. The |
| coefficient of correlation varies from -1 to 1. Coefficient of 1 implies perfect |
| correlation, 0 means no correlation, and -1 means perfect anti-correlation. |
| |
| This function provides a cross-correlation matrix for all pairs of numeric |
| columns in a <em>source_table</em>. A Correlation matrix describes correlation |
| among \f$ M \f$ variables. It is a square symmetrical \f$ M \f$x \f$M \f$ matrix |
| with the \f$ (ij) \f$th element equal to the correlation coefficient between the |
| \f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of |
| variables with themselves) are always equal to 1.0. |
| |
| @usage |
| |
| Currently the correlation function can be used in the following way: |
| @verbatim |
| SELECT {schema_madlib}.correlation |
| ( |
| source_table TEXT, -- Source table name (Required) |
| output_table TEXT, -- Output table name (Required) |
| target_cols TEXT, -- Comma separated columns for which summary is desired |
| -- (Default: NULL - produces result for all columns) |
| ) |
| @endverbatim |
| |
| Output will be a table with N+2 columns and N rows, where N is the number of |
| target columns. |
| |
| - column_position: The first column provides position of the variable in |
| the '<em>source_table</em>' |
| - variable: The second column gives the row-header for each |
| variable |
| The rest of the table is the NxN correlation matrix for all pairs of |
| numeric columns in `<em>source_table</em>'. |
| |
| The output table is arranged as a lower-traingular matrix with the upper |
| triangle set to NULL and the diagonal elements set to 1.0. To obtain the result |
| from the '<em>output_table</em>' in this matrix format ensure to order the |
| elements using the '<em>column_position</em>', as given in the example below. |
| |
| @verbatim |
| sql> SELECT * FROM output_table order by column_position; |
| @endverbatim |
| |
| @examp |
| |
| @verbatim |
| DROP TABLE IF EXISTS example_data; |
| CREATE TABLE example_data( |
| id SERIAL, |
| outlook text, |
| temperature float8, |
| humidity float8, |
| windy text, |
| class text) ; |
| |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play'); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' '); |
| INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', ''); |
| |
| SELECT madlib.correlation('example_data', 'example_data_output'); |
| SELECT madlib.correlation('example_data', 'example_data_output', '*'); |
| SELECT madlib.correlation('example_data', 'example_data_output', 'temperature, humidity'); |
| @endverbatim |
| To get the correlation matrix from output table: |
| @verbatim |
| SELECT * from example_data_output order by column_position; |
| @endverbatim |
| */ |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.correlation_result; |
| CREATE TYPE MADLIB_SCHEMA.correlation_result AS |
| ( |
| output_table TEXT, |
| row_count INT4, |
| duration FLOAT8 |
| ); |
| |
| ----------------------------------------------------------------------- |
| -- Main function for correlation |
| ----------------------------------------------------------------------- |
| /* @brief Compute a correlation matrix for a table with optional target columns specified |
| |
| @param source_table Name of source relation containing the data |
| @param output_table Name of output table name to store the correlation |
| @param target_cols String with comma separated list of columns for which cross-correlation is desired |
| |
| @usage |
| <pre> SELECT MADLIB_SCHEMA.correlation ( |
| '<em>source_table</em>', '<em>output_table</em>', |
| '<em>target_cols</em>' |
| ); |
| SELECT * FROM '<em>output_table</em>' order by '<em>colum_position</em>'; |
| </pre> |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.correlation( |
| source_table VARCHAR, -- input table name |
| output_table VARCHAR, -- output table name |
| target_cols VARCHAR -- comma separated list of output cols |
| -- (default = '*') |
| ) |
| RETURNS MADLIB_SCHEMA.correlation_result AS $$ |
| PythonFunctionBodyOnly(`stats', `correlation') |
| return correlation.correlation(schema_madlib, source_table, output_table, target_cols) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| ----------------------------------------------------------------------- |
| -- Overloaded function |
| ----------------------------------------------------------------------- |
| CREATE FUNCTION MADLIB_SCHEMA.correlation( |
| source_table VARCHAR, -- input table name |
| output_table VARCHAR -- output table name |
| ) |
| RETURNS MADLIB_SCHEMA.correlation_result AS $$ |
| select MADLIB_SCHEMA.correlation($1, $2, NULL) |
| $$ LANGUAGE sql; |
| |
| ----------------------------------------------------------------------- |
| -- Help functions |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation( |
| input_message TEXT |
| ) |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`stats', `correlation') |
| return correlation.correlation_help_message(schema_madlib, input_message) |
| $$ LANGUAGE plpythonu; |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation() |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`stats', `correlation') |
| return correlation.correlation_help_message(schema_madlib, None) |
| $$ LANGUAGE plpythonu; |
| ------------------------------------------------------------------------- |