| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file regression.sql_in |
| * |
| * @brief SQL functions for multi-linear and logistic regression |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to multi-linear regression, see the module |
| * description \ref grp_linreg. Likewise, for logistic regression, see the |
| * module description \ref grp_logreg. |
| * |
| *//* -------------------------------------------------------------------------- |
| * |
 * This file is preprocessed with m4. Macro expansion can be turned off by
| * enclosing text in <nom4> and </nom4>. |
| */ |
| changequote(`<nom4>', `</nom4>') |
| |
| /** |
| @addtogroup grp_linreg |
| |
| @about |
| |
| Linear regression refers to a stochastic model, in which the conditional mean |
| of the dependent variable (usually denoted $y$) is an affine function of the |
| vector of independent variables (usually denoted \f$ \boldsymbol x \f$): |
| \f[ |
| E[y \mid \boldsymbol x] = \boldsymbol c^T \boldsymbol x |
| \f] |
| for some unknown vector of coefficients \f$ \boldsymbol c \f$. |
| |
| We fit the model with the ordinary-least-squares method. That is, the vector of |
| regression coefficients \f$ \boldsymbol c \f$ is estimated as: |
| \f[ |
| \boldsymbol c = (X^T X)^+ X^T \boldsymbol y = X^+ \boldsymbol y |
| \f] |
| where |
| - $X$ is the design matrix with $k$ columns and $n$ rows, containing all |
| observed vectors of independent variables \f$ \boldsymbol x_i \f$ as rows |
| - $X^T$ denotes the transpose of $X$ |
- $X^+$ denotes the pseudo-inverse of $X$.
| Note: The identity \f$ X^+ = (X^T X)^+ X^T \f$ holds for all matrices $X$. A |
| proof can be found, e.g., at: |
| http://en.wikipedia.org/wiki/Proofs_involving_the_Moore%2DPenrose_pseudoinverse |
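
For the implementation, it is worth noting (as a sketch only, not a description
of the exact internal state layout) that the normal-equations form requires only
per-row sums such as
\f[
    X^T X = \sum_{i=1}^n \boldsymbol x_i \boldsymbol x_i^T,
    \qquad
    X^T \boldsymbol y = \sum_{i=1}^n y_i \boldsymbol x_i,
\f]
which is why the SQL interface below is an aggregate: float8_mregr_accum()
accumulates such sums row by row, and float8_mregr_combine() merges partial
sums computed on different segments.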
| |
| Computing the <b>total sum of squares</b> $TSS$, the <b>explained |
| sum of squares</b> $ESS$ (also called the regression sum of |
| squares), and the <b>residual sum of squares</b> $RSS$ (also called sum of |
| squared residuals or sum of squared errors of prediction) is |
| done according to the following formulas: |
| \f[\begin{align*} |
| ESS & = \boldsymbol y^T X \boldsymbol c |
| - \frac{ \| y \|_1^2 }{n} \\ |
	TSS & = \sum_{i=1}^n y_i^2
| - \frac{ \| y \|_1^2 }{n} \\ |
| R^2 & = \frac{ESS}{TSS} |
| \end{align*}\f] |
| Note: The last equality follows from the definition |
| \f$ R^2 = 1 - \frac{RSS}{TSS} \f$ and the fact that for linear regression |
| $TSS = RSS + ESS$. A proof of the latter can be found, e.g., at: |
| http://en.wikipedia.org/wiki/Sum_of_squares |
| |
| We estimate the variance |
| \f$ Var[y - \boldsymbol c^T \boldsymbol x \mid \boldsymbol x] \f$ as |
| \f[ |
| \sigma^2 = \frac{RSS}{n - k} |
| \f] |
| and compute the t-statistic for coefficient $i$ as |
| \f[ |
| t_i = \frac{c_i}{\sqrt{\sigma^2 \cdot \left( (X^T X)^{-1} \right)_{ii} }} |
| \f] |
| |
The $p$-value for coefficient $i$ is the probability, under the null hypothesis
($c_i = 0$), of observing a test statistic at least as extreme as $t_i$. Letting
\f$ F_\nu \f$ denote the cumulative distribution function of the Student-t
distribution with \f$ \nu \f$ degrees of freedom, the $p$-value for coefficient
$i$ is therefore
| \f[ |
| p_i = P(|T| \geq |t_i|) = 2 \cdot (1 - F_{n - k}( |t_i| )) |
| \f] |
where $T$ is a Student-t distributed random variable with \f$ n - k \f$ degrees
of freedom.
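
As an illustration of this formula (a hypothetical check, assuming the functions
are installed in schema <tt>madlib</tt>), the two-sided $p$-value for a given
t-statistic can be evaluated directly with student_t_cdf(), which is defined
later in this file. With the houses example below, \f$ n - k = 15 - 4 = 11 \f$,
so the following should approximately reproduce the reported $p$-value of the
last coefficient:

@verbatim
# select 2 * (1 - madlib.student_t_cdf(11, 3.61223));
@endverbatim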
| |
| @prereq |
| |
| Implemented in C for PostgreSQL/Greenplum. |
| |
| @usage |
| |
| -# The data set is expected to be of the following form:\n |
| <tt>{TABLE|VIEW} <em>sourceName</em> ([...] <em>dependentVariable</em> |
| DOUBLE PRECISION, <em>independentVariables</em> DOUBLE PRECISION[], |
| [...])</tt> |
| -# Run the linear regression by:\n |
| <tt>SELECT mregr_coef(<em>dependentVariable</em>, |
| <em>independentVariables</em>) FROM <em>sourceName</em></tt>\n |
| Note: In order to model an intercept, set one coordinate in the |
| <tt>independentVariables</tt> array to 1. (See below for an example.) |
| \n |
| -# The coefficient of determination (also denoted $R^2$), the vector of |
| t-statistics, and the vector of p-values can be determined likewise by |
| mregr_r2(), mregr_tstats(), mregr_pvalues(). |
| |
| @examp |
| |
The following example is taken from
| http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf. |
| |
| @verbatim |
| # select * from houses; |
| id | tax | bedroom | bath | price | size | lot |
| ----+------+---------+------+--------+------+------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 |
| (15 rows) |
| |
| # select mregr_coef(price, array[1, bedroom, bath, size])::REAL[] from houses; |
| mregr_coef |
| ------------------------------------ |
| {27923.4,-35524.8,2269.34,130.794} |
| (1 row) |
| |
| # select mregr_r2(price, array[1, bedroom, bath, size])::REAL from houses; |
| mregr_r2 |
| ---------- |
| 0.745374 |
| (1 row) |
| |
| # select mregr_tstats(price, array[1, bedroom, bath, size])::REAL[] from houses; |
| mregr_tstats |
| -------------------------------------- |
| {0.495919,-1.41891,0.102183,3.61223} |
| (1 row) |
| |
| # select mregr_pvalues(price, array[1, bedroom, bath, size])::REAL[] from houses; |
| mregr_pvalues |
| ----------------------------------------- |
| {0.629711,0.183633,0.920451,0.00408159} |
| (1 row) |
| @endverbatim |
| |
| @sa file regression.sql_in (documenting the SQL functions) |
| |
| @internal |
| @sa file regress.c (documenting the implementation in C), function |
| float8_mregr_compute() (documenting the formulas used for coefficients, |
| $R^2$, t-statistics, and p-values, implemented in C) |
| @endinternal |
| |
| @literature |
| |
| [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 21 October |
| 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/17/lecture-17.pdf |
| |
| */ |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_accum(state DOUBLE PRECISION[], y DOUBLE PRECISION, x DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.float8_mregr_combine(state1 DOUBLE PRECISION[], state2 DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| -- Final functions |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_coef(DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_r2(DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_tstats(DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_mregr_pvalues(DOUBLE PRECISION[]) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| |
| /** |
| * @brief Compute multi-linear regression coefficients. |
| * |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @return Array of coefficients, which has the same length as the array of |
| * independent variables. |
| * |
 * @examp <tt>SELECT mregr_coef(y, array[1, x1, x2]) FROM data;</tt>
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mregr_coef( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_mregr_accum, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.float8_mregr_coef, |
| ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>) |
| INITCOND='{0}' |
| ); |
| |
| /** |
| * @brief Compute the coefficient of determination, $R^2$. |
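 *
 * @examp <tt>SELECT mregr_r2(y, array[1, x1, x2]) FROM data;</tt>
 *     (using the same hypothetical table as in the mregr_coef() example)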
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mregr_r2( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_mregr_accum, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.float8_mregr_r2, |
| ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>) |
| INITCOND='{0}' |
| ); |
| |
| /** |
 * @brief Compute the vector of t-statistics for every coefficient.
| * |
| * To include an intercept in the model, set one coordinate in the |
| * independentVariables array to 1. |
| * |
| * @param dependentVariable Dependent variable |
| * @param independentVariables Array of independent variables |
| * @return Array of t-statistics for each coefficient. The returned array has |
| * the same length as the array of independent variables. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mregr_tstats( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_mregr_accum, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.float8_mregr_tstats, |
| ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>) |
| INITCOND='{0}' |
| ); |
| |
| /** |
 * @brief Compute the vector of p-values for every coefficient.
| * |
| * @param dependentVariable Dependent variable |
| * @param independentVariables Array of independent variables |
| * @return Array of p-values for each coefficient. The returned array has |
| * the same length as the array of independent variables. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mregr_pvalues( |
| /*+ "dependentVariable" */ DOUBLE PRECISION, |
| /*+ "independentVariables" */ DOUBLE PRECISION[]) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_mregr_accum, |
| STYPE=float8[], |
| FINALFUNC=MADLIB_SCHEMA.float8_mregr_pvalues, |
| ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>) |
| INITCOND='{0}' |
| ); |
| |
| /** |
| * @brief Student-t cumulative distribution function. |
| * |
 * @param nu Degrees of freedom (>= 1).
| * @param x |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.student_t_cdf(nu INTEGER, x DOUBLE PRECISION) |
| RETURNS DOUBLE PRECISION |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C |
| IMMUTABLE STRICT; |
| |
| |
| /** |
| @addtogroup grp_logreg |
| |
| @about |
| |
| Logistic regression is used to estimate probabilities of a dependent binary |
| variable, by fitting a stochastic model. It is one of the most commonly used |
| tools for applied statistics and data mining [1]. |
| |
| Logistic regression assumes a generalized linear model: |
| \f[ |
| E[Y] = g^{-1}(\boldsymbol c^T X) |
| \f] |
| where: |
| - $Y$ is the dependent variable |
| - \f$\boldsymbol c^T X\f$ is the linear predictor |
| - \f$g(x) = \ln\left( \frac{x}{1-x} \right)\f$ is the link function, with |
| inverse \f$\sigma(x) := g^{-1}(x) = \frac{1}{1 + \exp(-x)} \f$ |
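
For example, once a coefficient vector has been estimated (e.g., with
logregr_coef() below), predicted probabilities follow from the inverse link
function. A minimal hand-written sketch, where the table <tt>data</tt>, the
feature columns <tt>x1, x2</tt>, and the coefficients <tt>c1, c2, c3</tt> (with
<tt>c1</tt> the intercept) are all hypothetical:

@verbatim
# select 1 / (1 + exp(-(c1 + c2*x1 + c3*x2))) as predicted_probability
#   from data;
@endverbatim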
| |
| For each training data point $i$, we have a vector of |
| features $x_i$ and an observed class $y_i$. For ease of notation, let $Z$ be a |
| dependent random variable such that $Z = -1$ if $Y = 0$ and $Z = 1$ if $Y = 1$, |
| i.e., \f$Z := 2(Y - \frac 12)\f$. By definition, |
\f$P[Z = z_i \mid X = x_i] = \sigma(z_i \cdot \boldsymbol c^T x_i)\f$.
| |
| Since logistic regression predicts probabilities, we can do maximum-likelihood |
| fitting: That is, we want the vector of regression coefficients |
| \f$\boldsymbol c\f$ to maximize |
| \f[ |
| \prod_{i=1}^n \sigma(z_i \cdot \boldsymbol c^T \boldsymbol x_i) |
| \f] |
| or, equivalently, to maximize the objective |
| \f[ |
| l(\boldsymbol c) = |
| -\sum_{i=1}^n \ln(1 + \exp(-z_i \cdot \boldsymbol c^T \boldsymbol x_i)) |
| \f] |
By looking at the Hessian, we can verify that \f$l(\boldsymbol c)\f$ is concave,
so maximizing it is a convex optimization problem.
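
For reference, one way to write the gradient of this objective is
\f[
    \nabla_{\boldsymbol c} \, l(\boldsymbol c)
    = \sum_{i=1}^n \sigma(-z_i \cdot \boldsymbol c^T \boldsymbol x_i)
      \cdot z_i \boldsymbol x_i.
\f]
This is only a sketch for orientation; the C implementation may use an
algebraically equivalent form. At the maximum, this gradient vanishes.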
| |
| There are many techniques for solving convex optimization problems. Currently, |
| logistic regression in MADlib can use one of two algorithms: |
| - Iteratively Reweighted Least Squares |
- A conjugate-gradient approach, also known as the Fletcher-Reeves method in the
| literature, where we use the Hestenes-Stiefel rule for calculating the step |
| size. |
| |
| |
| @prereq |
| |
| Implemented in C (the computation) and Python (the driver/outer loop) for |
| PostgreSQL/Greenplum. |
| |
| @usage |
| |
| -# The training data is expected to be of the following form:\n |
| <tt>{TABLE|VIEW} <em>sourceName</em> ([...] <em>dependentVariable</em> |
| BOOLEAN, <em>independentVariables</em> DOUBLE PRECISION[], [...])</tt> |
| -# Run the logistic regression by:\n |
| <tt>SELECT logregr_coef('<em>sourceName</em>', '<em>dependentVariable</em>', |
| '<em>independentVariables</em>', <em>numIterations</em>, |
| '<em>optimizer</em>', <em>precision</em>)</tt>\n |
| The last three arguments are optional and can be omitted, in which case |
| default values will be used. See logregr_coef().\n |
| Note: In order to model an intercept, set one coordinate in the |
| <tt>independentVariables</tt> array to 1. |
| |
| |
| @examp |
| |
| @verbatim |
| # select * from artificiallogreg; |
| y | x |
| ---+------------------------------------------------------- |
| t | {-1.19845,1.15366,0.941779,-0.23669,-0.711024} |
| f | {-0.0680205,-0.716733,-0.149781,-0.410448,-0.0843123} |
| f | {-0.330021,0.222596,-0.976091,0.773816,-1.06238} |
| f | {0.648293,0.286225,0.524144,-0.141286,-1.41774} |
| f | {0.859484,-0.412929,-0.273368,-0.243059,0.714789} |
| [...] |
| |
| # select madlib.logregr_coef( |
| # 'artificiallogreg', 'y', 'x', 20, 'irls', 0.001 |
| # )::REAL[]; |
| logregr_coef |
| --------------------------------------------- |
| {-3.0307,3.63312,0.714105,-1.72496,1.37484} |
| @endverbatim |
| |
| @sa file regression.sql_in (documenting the SQL functions) |
| |
| @internal |
| @sa namespace logRegress (documenting the driver/outer loop implemented in |
| Python), function float8_cg_update_final() (documenting the |
| conjugate-gradient update/iteration steps, implemented in C), function |
    float8_irls_update_accum() (documenting the
| iteratively-reweighted-least-squares update/iteration steps, implemented in |
| C) |
| @endinternal |
| |
| @literature |
| |
| A somewhat random selection of nice write-ups, with valuable pointers into |
| further literature: |
| |
| [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November |
| 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf |
| |
| [2] Thomas P. Minka: A comparison of numerical optimizers for logistic |
| regression, 2003 (revised Mar 26, 2007), |
| http://research.microsoft.com/en-us/um/people/minka/papers/logreg/minka-logreg.pdf |
| |
| [3] Paul Komarek, Andrew W. Moore: Making Logistic Regression A Core Data Mining |
| Tool With TR-IRLS, IEEE International Conference on Data Mining 2005, |
| pp. 685-688, http://komarix.org/ac/papers/tr-irls.short.pdf |
| */ |
| |
| CREATE TYPE MADLIB_SCHEMA.logregr_cg_state AS ( |
| iteration INTEGER, |
| len INTEGER, |
| coef DOUBLE PRECISION[], |
| dir DOUBLE PRECISION[], |
| grad DOUBLE PRECISION[], |
| beta DOUBLE PRECISION, |
| |
| count BIGINT, |
| gradNew DOUBLE PRECISION[], |
| dTHd DOUBLE PRECISION, |
| logLikelihood DOUBLE PRECISION |
| ); |
| |
| CREATE TYPE MADLIB_SCHEMA.logregr_irls_state AS ( |
| coef DOUBLE PRECISION[], |
| logLikelihood DOUBLE PRECISION |
| ); |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_cg_update_accum( |
| MADLIB_SCHEMA.logregr_cg_state, |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| MADLIB_SCHEMA.logregr_cg_state) |
| RETURNS MADLIB_SCHEMA.logregr_cg_state |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_irls_update_accum( |
| DOUBLE PRECISION[], |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| MADLIB_SCHEMA.logregr_irls_state) |
| RETURNS DOUBLE PRECISION[] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_cg_update_final(MADLIB_SCHEMA.logregr_cg_state) |
| RETURNS MADLIB_SCHEMA.logregr_cg_state |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.float8_irls_update_final(DOUBLE PRECISION[]) |
| RETURNS MADLIB_SCHEMA.logregr_irls_state |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| /** |
| * @internal |
| * @brief Perform one iteration of the conjugate-gradient method for computing |
| * logistic regression |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.logregr_cg_step( |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| MADLIB_SCHEMA.logregr_cg_state) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_cg_update_accum, |
| STYPE=MADLIB_SCHEMA.logregr_cg_state, |
| FINALFUNC=MADLIB_SCHEMA.float8_cg_update_final |
| ); |
| |
| /** |
| * @internal |
 * @brief Perform one iteration of the iteratively-reweighted-least-squares
 *        method for computing logistic regression
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.logregr_irls_step( |
| BOOLEAN, |
| DOUBLE PRECISION[], |
| MADLIB_SCHEMA.logregr_irls_state) ( |
| |
| SFUNC=MADLIB_SCHEMA.float8_irls_update_accum, |
| STYPE=float8[], |
    ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc=MADLIB_SCHEMA.float8_mregr_combine,</nom4>)
| FINALFUNC=MADLIB_SCHEMA.float8_irls_update_final, |
| INITCOND='{0}' |
| ); |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_should_terminate( |
| DOUBLE PRECISION[], |
| DOUBLE PRECISION[], |
| VARCHAR, |
| DOUBLE PRECISION) |
| RETURNS BOOLEAN |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| -- begin functions for logistic-regression coefficients |
| -- We only need to document the last one (unfortunately, in Greenplum we have to |
| -- use function overloading instead of default arguments). |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| import sys |
| try: |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| |
| return logRegress.compute_logregr_coef(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| import sys |
| try: |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| |
| return logRegress.compute_logregr_coef(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER, |
| "optimizer" VARCHAR) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| import sys |
| try: |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| |
| return logRegress.compute_logregr_coef(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| |
| /** |
| * @brief Compute logistic-regression coefficients |
| * |
| * To include an intercept in the model, set one coordinate in the |
| * <tt>independentVariables</tt> array to 1. |
| * |
| * @param source Name of the source relation containing the training data |
| * @param depColumn Name of the dependent column (of type BOOLEAN) |
| * @param indepColumn Name of the independent column (of type DOUBLE |
| * PRECISION[]) |
| * @param numIterations The maximum number of iterations |
 * @param optimizer The optimizer to use (either
 *     <tt>'irls'</tt>/<tt>'newton'</tt> for iteratively reweighted least
 *     squares or <tt>'cg'</tt> for conjugate gradient)
| * @param precision The difference between log-likelihood values in successive |
| * iterations that should indicate convergence, or 0 indicating that |
| * log-likelihood values should be ignored |
| * |
| * @note This function starts an iterative algorithm. It is not an aggregate |
| * function. Source and column names have to be passed as strings (due to |
| * limitations of the SQL syntax). |
| * |
| * @examp <tt>SELECT logregr_coef('data', 'y', 'array[1, x1, x2]', 20, 'cg', |
| * 0.001);</tt> |
| * |
| * @internal |
| * @sa This function is a wrapper for logRegress::compute_logregr_coef(), which |
| * sets the default values. |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( |
| "source" VARCHAR, |
| "depColumn" VARCHAR, |
| "indepColumn" VARCHAR, |
| "numIterations" INTEGER /*+ DEFAULT 20 */, |
| "optimizer" VARCHAR /*+ DEFAULT 'irls' */, |
| "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */) |
| RETURNS DOUBLE PRECISION[] AS $$ |
| import sys |
| try: |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| except: |
| sys.path.append("PLPYTHON_LIBDIR") |
| ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import logRegress |
| |
| return logRegress.compute_logregr_coef(**globals()) |
| $$ LANGUAGE plpythonu VOLATILE; |
| |
| |
| ifdef(<nom4>PGXS</nom4>,<nom4> |
| /* |
| * Function for initializing python paths to the paths in dynamic_library_path. |
| * This is only needed when debugging Python-based functions without installing |
| * them in a location where Python would find them automatically. |
| */ |
| CREATE FUNCTION MADLIB_SCHEMA.init_python_paths() |
| RETURNS VOID AS |
| $$ |
| # FIXME: The following code should be common code and not reside in a specialized module |
| import sys |
| |
| dyld_paths = plpy.execute( |
| "SHOW dynamic_library_path")[0]["dynamic_library_path"].split(':') |
| before_default = True |
| count = 0 |
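# Paths listed before "$libdir" are inserted at the front of sys.path (in
# order), so they take precedence; paths listed after "$libdir" are appended.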
| for path in dyld_paths: |
| if path == "$libdir": |
| before_default = False |
| else: |
| if before_default: |
| sys.path.insert(count, path) |
| count += 1 |
| else: |
| sys.path.append(path) |
| $$ LANGUAGE plpythonu VOLATILE; |
| </nom4>) |