blob: a2926b66a5194d9d7be13ea368570da8484bd375 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file correlation.sql_in
*
* @brief SQL functions for correlation computation
* @date April 2013
*
* @sa For a brief introduction to correlation, see the
* module description \ref grp_correlation
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_correlation
@about
A correlation function is the degree and direction of association of two
variables; how well can one random variable be predicted from the other. The
coefficient of correlation varies from -1 to 1. Coefficient of 1 implies perfect
correlation, 0 means no correlation, and -1 means perfect anti-correlation.
This function provides a cross-correlation matrix for all pairs of numeric
columns in a <em>source_table</em>. A Correlation matrix describes correlation
among \f$ M \f$ variables. It is a square symmetrical \f$ M \f$x \f$M \f$ matrix
with the \f$ (ij) \f$th element equal to the correlation coefficient between the
\f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of
variables with themselves) are always equal to 1.0.
@usage
Currently the correlation function can be used in the following way:
@verbatim
SELECT {schema_madlib}.correlation
(
source_table TEXT, -- Source table name (Required)
output_table TEXT, -- Output table name (Required)
target_cols TEXT, -- Comma separated columns for which summary is desired
-- (Default: NULL - produces result for all columns)
)
@endverbatim
Output will be a table with N+2 columns and N rows, where N is the number of
target columns.
- column_position: The first column provides position of the variable in
the '<em>source_table</em>'
- variable: The second column gives the row-header for each
variable
The rest of the table is the NxN correlation matrix for all pairs of
numeric columns in `<em>source_table</em>'.
The output table is arranged as a lower-traingular matrix with the upper
triangle set to NULL and the diagonal elements set to 1.0. To obtain the result
from the '<em>output_table</em>' in this matrix format ensure to order the
elements using the '<em>column_position</em>', as given in the example below.
@verbatim
sql> SELECT * FROM output_table order by column_position;
@endverbatim
@examp
@verbatim
DROP TABLE IF EXISTS example_data;
CREATE TABLE example_data(
id SERIAL,
outlook text,
temperature float8,
humidity float8,
windy text,
class text) ;
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' ');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', '');
SELECT madlib.correlation('example_data', 'example_data_output');
SELECT madlib.correlation('example_data', 'example_data_output', '*');
SELECT madlib.correlation('example_data', 'example_data_output', 'temperature, humidity');
@endverbatim
To get the correlation matrix from output table:
@verbatim
SELECT * from example_data_output order by column_position;
@endverbatim
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.correlation_result;
CREATE TYPE MADLIB_SCHEMA.correlation_result AS
(
output_table TEXT,
row_count INT4,
duration FLOAT8
);
-----------------------------------------------------------------------
-- Main function for correlation
-----------------------------------------------------------------------
/* @brief Compute a correlation matrix for a table with optional target columns specified
@param source_table Name of source relation containing the data
@param output_table Name of output table name to store the correlation
@param target_cols String with comma separated list of columns for which cross-correlation is desired
@usage
<pre> SELECT MADLIB_SCHEMA.correlation (
'<em>source_table</em>', '<em>output_table</em>',
'<em>target_cols</em>'
);
SELECT * FROM '<em>output_table</em>' order by '<em>colum_position</em>';
</pre>
*/
CREATE FUNCTION MADLIB_SCHEMA.correlation(
source_table VARCHAR, -- input table name
output_table VARCHAR, -- output table name
target_cols VARCHAR -- comma separated list of output cols
-- (default = '*')
)
RETURNS MADLIB_SCHEMA.correlation_result AS $$
PythonFunctionBodyOnly(`stats', `correlation')
return correlation.correlation(schema_madlib, source_table, output_table, target_cols)
$$ LANGUAGE plpythonu VOLATILE;
-----------------------------------------------------------------------
-- Overloaded function
-----------------------------------------------------------------------
CREATE FUNCTION MADLIB_SCHEMA.correlation(
source_table VARCHAR, -- input table name
output_table VARCHAR -- output table name
)
RETURNS MADLIB_SCHEMA.correlation_result AS $$
select MADLIB_SCHEMA.correlation($1, $2, NULL)
$$ LANGUAGE sql;
-----------------------------------------------------------------------
-- Help functions
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
input_message TEXT
)
RETURNS TEXT AS $$
PythonFunctionBodyOnly(`stats', `correlation')
return correlation.correlation_help_message(schema_madlib, input_message)
$$ LANGUAGE plpythonu;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation()
RETURNS TEXT AS $$
PythonFunctionBodyOnly(`stats', `correlation')
return correlation.correlation_help_message(schema_madlib, None)
$$ LANGUAGE plpythonu;
-------------------------------------------------------------------------