-- blob: a12da7e54c2ba7c2711d1a0f668e66c792217714 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @file logistic.sql_in
*
* @brief SQL functions for logistic regression
* @date January 2011
*
* @sa For a brief introduction to logistic regression, see the
* module description \ref grp_logreg.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
-- CASCADE also removes objects that depend on the type (for example functions
-- returning it), so such objects have to be created again after this point.
DROP TYPE IF EXISTS MADLIB_SCHEMA.__logregr_simple_result CASCADE;

/**
 * @internal
 * @brief Composite type returned by __logregr_simple_finalizer
 *
 * NOTE(review): the field meanings presumably match the like-named output
 * columns documented for logregr_simple_train -- confirm against the C code.
 */
CREATE TYPE MADLIB_SCHEMA.__logregr_simple_result AS (
    coef            DOUBLE PRECISION[],
    log_likelihood  DOUBLE PRECISION,
    std_err         DOUBLE PRECISION[],
    z_stats         DOUBLE PRECISION[],
    p_values        DOUBLE PRECISION[],
    odds_ratios     DOUBLE PRECISION[],
    vcov            DOUBLE PRECISION[],
    condition_no    DOUBLE PRECISION,
    status          INTEGER,
    num_processed   BIGINT,
    num_iterations  INTEGER
);
/**
 * @internal
 * @brief Transition function of the __logregr_simple_step aggregate
 *
 * Implemented in C under the symbol logregr_simple_step_transition. The first
 * argument is the running aggregate state; the remaining three are the
 * arguments of the aggregate itself.
 *
 * Unlike the merge and final functions, this function is not declared STRICT,
 * so it is also invoked when one of its arguments is NULL.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_simple_step_transition(
    /*+ state */ DOUBLE PRECISION[],
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_simple_step_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
/**
 * @internal
 * @brief Merge function of the __logregr_simple_step aggregate
 *
 * Takes two aggregate states and returns a single state. Implemented in C
 * under the symbol logregr_simple_step_merge_states. The aggregate registers
 * it as its prefunc only on builds where the PostgreSQL platform macro is not
 * defined.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_simple_step_merge_states(
    state1 DOUBLE PRECISION[],
    state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_simple_step_merge_states'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
/**
 * @internal
 * @brief Final function of the __logregr_simple_step aggregate
 *
 * Maps the accumulated state of one iteration to the state array returned by
 * the aggregate. Implemented in C under the symbol logregr_simple_step_final.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_simple_step_final(
    state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME', 'logregr_simple_step_final'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
/**
 * @internal
 * @brief Perform one iteration of the conjugate-gradient method for computing
 *        logistic regression
 *
 * The aggregate takes the dependent value y, the independent vector x and the
 * state of the previous iteration. Its own state is a DOUBLE PRECISION array
 * that starts out as six zeros.
 *
 * NOTE(review): "conjugate-gradient" is carried over from the original
 * comment; the algorithm itself lives in the C functions -- confirm.
 */
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__logregr_simple_step(
    BOOLEAN, DOUBLE PRECISION[], DOUBLE PRECISION[]);

CREATE AGGREGATE MADLIB_SCHEMA.__logregr_simple_step(
    /*+ y */ BOOLEAN,
    /*+ x */ DOUBLE PRECISION[],
    /*+ previous_state */ DOUBLE PRECISION[]) (
    STYPE = DOUBLE PRECISION[],
    SFUNC = MADLIB_SCHEMA.__logregr_simple_step_transition,
    m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__logregr_simple_step_merge_states,')
    FINALFUNC = MADLIB_SCHEMA.__logregr_simple_step_final,
    INITCOND = '{0,0,0,0,0,0}'
);
------------------------------------------------------------------------
/**
 * @internal
 * @brief Return a scalar distance between two iteration states
 *
 * Implemented in C under the symbol internal_logregr_simple_step_distance.
 *
 * NOTE(review): presumably this is the value compared against the tolerance
 * argument of logregr_simple_train -- confirm against the driver code.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_simple_step_distance(
    /*+ state1 */ DOUBLE PRECISION[],
    /*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
AS 'MODULE_PATHNAME', 'internal_logregr_simple_step_distance'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
/**
 * @internal
 * @brief Convert an iteration state into a __logregr_simple_result row
 *
 * Implemented in C under the symbol internal_logregr_simple_result.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__logregr_simple_finalizer(
    /*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.__logregr_simple_result
AS 'MODULE_PATHNAME', 'internal_logregr_simple_result'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
/**
* @brief Compute logistic-regression coefficients and diagnostic statistics
*
* To include an intercept in the model, set one coordinate in the
* <tt>independentVariables</tt> array to 1.
*
* @param source_table Name of the source relation containing the training data
* @param out_table Name of the output relation to store the model results
*
* Columns of the output relation are as follows:
* - <tt>coef FLOAT8[]</tt> - Array of coefficients, \f$ \boldsymbol c \f$
* - <tt>log_likelihood FLOAT8</tt> - Log-likelihood \f$ l(\boldsymbol c) \f$
* - <tt>std_err FLOAT8[]</tt> - Array of standard errors,
* \f$ \mathit{se}(c_1), \dots, \mathit{se}(c_k) \f$
* - <tt>z_stats FLOAT8[]</tt> - Array of Wald z-statistics, \f$ \boldsymbol z \f$
* - <tt>p_values FLOAT8[]</tt> - Array of Wald p-values, \f$ \boldsymbol p \f$
* - <tt>odds_ratios FLOAT8[]</tt> - Array of odds ratios,
* \f$ \mathit{odds}(c_1), \dots, \mathit{odds}(c_k) \f$
* - <tt>condition_no FLOAT8</tt> - The condition number of
* matrix \f$ X^T A X \f$ during the iteration
* immediately <em>preceding</em> convergence
* (i.e., \f$ A \f$ is computed using the coefficients
* of the previous iteration)
* @param dependent_varname Name of the dependent column (of type BOOLEAN)
* @param independent_varname Name of the independent column (of type DOUBLE
* PRECISION[])
* @param max_iter The maximum number of iterations
* @param tolerance The difference between log-likelihood values in successive
* iterations that should indicate convergence. This value should be
* non-negative and a zero value here disables the convergence criterion,
* and execution will only stop after \c max_iter iterations.
* @param verbose If true, any error or warning message will be printed to the
* console (irrespective of the 'client_min_messages' set by server).
* If false, no error/warning message is printed to console.
*
*
* @usage
* - Get vector of coefficients \f$ \boldsymbol c \f$ and all diagnostic
* statistics:\n
* <pre>SELECT logregr_simple_train('<em>sourceName</em>', '<em>outName</em>',
* '<em>dependentVariable</em>', '<em>independentVariables</em>');
* SELECT * from outName;
* </pre>
* - Get vector of coefficients \f$ \boldsymbol c \f$:\n
* <pre>SELECT coef from outName;</pre>
* - Get a subset of the output columns, e.g., only the array of coefficients
* \f$ \boldsymbol c \f$, the log-likelihood
* \f$ l(\boldsymbol c) \f$, and the array of p-values \f$ \boldsymbol p \f$:
* <pre>SELECT coef, log_likelihood, p_values FROM outName;</pre>
*
* @note This function starts an iterative algorithm. It is not an aggregate
* function. Source, output, and column names have to be passed as strings
* (due to limitations of the SQL syntax).
*
* @internal
* @sa This function is a wrapper for logregr_simple_train() in the Python
* module simple_logistic. The four-argument overload sets the default values.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_simple_train(
    source_table        VARCHAR,
    out_table           VARCHAR,
    dependent_varname   VARCHAR,
    independent_varname VARCHAR,
    max_iter            INTEGER,
    tolerance           DOUBLE PRECISION,
    verbose             BOOLEAN
)
RETURNS VOID
AS $$
PythonFunction(hello_world, simple_logistic, logregr_simple_train)
$$
-- VOLATILE is the default volatility; it is spelled out here to match the
-- four-argument overload of this function.
LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
------------------------------------------------------------------------
/**
 * @brief Compute logistic-regression coefficients using default settings
 *
 * Forwards the four arguments to the seven-argument overload with at most 20
 * iterations, a tolerance of 0.0001 and verbose output switched off.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_simple_train(
    source_table        VARCHAR,
    out_table           VARCHAR,
    dependent_varname   VARCHAR,
    independent_varname VARCHAR
)
RETURNS VOID
AS $$
    SELECT MADLIB_SCHEMA.logregr_simple_train(
        $1, $2, $3, $4,
        20,      -- max_iter
        0.0001,  -- tolerance
        FALSE    -- verbose
    );
$$
LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
------------------------------------------------------------------------
-- Help messages -------------------------------------------------------
/**
 * @brief Return a help message for logregr_simple_train
 *
 * The text is produced by logregr_simple_help_msg in the Python module
 * simple_logistic.
 *
 * @param message Selector handed to the Python help routine.
 *        NOTE(review): the accepted values are not visible in this file --
 *        confirm against the Python code.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_simple_train(
    message TEXT
)
RETURNS TEXT
AS $$
PythonFunction(hello_world, simple_logistic, logregr_simple_help_msg)
$$
LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
/**
 * @brief Return the help message shown when no argument is given
 *
 * Calls the one-argument help overload with a NULL message. The NULL is cast
 * to TEXT so that the call resolves to that overload.
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.logregr_simple_train()
RETURNS TEXT
AS $$
    SELECT MADLIB_SCHEMA.logregr_simple_train(CAST(NULL AS TEXT));
$$
LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
-------------------------------------------------------------