blob: 377dfea511d1fc79dfb6be17b9381347367f6247 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file glm.sql_in
*
* @brief SQL functions for GLM (Poisson)
* @date June 2014
*
* @sa For a brief introduction to GLM (Poisson), see the
* module description \ref grp_poisson.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_glm
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#train">Training Function</a></li>
<li class="level1"><a href="#predict">Prediction Function</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
@brief Estimate generalized linear model (GLM).
GLM is a flexible generalization of ordinary linear regression that allows for
response variables that have error distribution models other than a normal
distribution. The GLM generalizes linear regression by allowing the linear
model to be related to the response variable via a link function and by
allowing the magnitude of the variance of each measurement to be a function
of its predicted value.
Generalized linear models extends ordinary linear regression by allowing the
response variable to follow a more general set of distributions (rather than
simply Gaussian distributions), and for a general family of functions of the
response variable (the link function) to vary linearly with the predicted values
(rather than assuming that the response itself must vary linearly).
For example, data of counts would typically be modeled with a Poisson
distribution and a log link, while binary outcomes would typically be modeled
with a Bernoulli distribution (or binomial distribution, depending on exactly
how the problem is phrased) and a log-odds (or logit) link function.
Currently, the implemented distribution families are
<center>
<table>
<tr>
<th>Distribution Family</th>
<th>Link Functions</th>
</tr>
<tr>
<td>Binomial</td>
<td>logit, probit</td>
</tr>
<tr>
<td>Gamma</td>
<td>inverse, identity, log</td>
</tr>
<tr>
<td>Gaussian</td>
<td>identity, inverse, log</td>
</tr>
<tr>
<td>Inverse Gaussian</td>
<td>inverse of square, inverse, identity, log</td>
</tr>
<tr>
<td>Poisson</td>
<td>log, identity, square-root<br></td>
</tr>
</table>
</center>
@anchor train
@par Training Function
GLM training function has the following format:
<pre class="syntax">
glm(source_table,
model_table,
dependent_varname,
independent_varname,
family_params,
grouping_col,
optim_params,
verbose
)
</pre>
\b Arguments
<DL class="arglist">
<DT>source_table</DT>
<DD>TEXT. The name of the table containing the training data.</DD>
<DT>model_table</DT>
<DD>TEXT. Name of the generated table containing the model.
The model table produced by glm contains the following columns:
<table class="output">
<tr>
<th>&lt;...&gt;</th>
<td>Text. Grouping columns, if provided in input. This could be multiple columns
depending on the \c grouping_col input.</td>
</tr>
<tr>
<th>coef</th>
<td>FLOAT8. Vector of the coefficients in linear predictor.</td>
</tr>
<tr>
<th>log_likelihood</th>
<td>FLOAT8. The log-likelihood \f$ l(\boldsymbol \beta) \f$. We use the maximum likelihood estimate of
dispersion parameter to calculate the log-likelihood while R and Python use deviance estimate and
Pearson estimate respectively.</td>
</tr>
<tr>
<th>std_err</th>
<td>FLOAT8[]. Vector of the standard error of the coefficients.</td>
</tr>
<tr>
<th>z_stats or t_stats</th>
<td>FLOAT8[]. Vector of the z-statistics (in Poisson distribtuion and Binomial distribution) or the t-statistics (in all other distributions) of the coefficients.</td>
</tr>
<tr>
<th>p_values</th>
<td>FLOAT8[]. Vector of the p-values of the coefficients.</td>
</tr>
<tr>
<th>dispersion</th>
<td>FLOAT8. The dispersion value (Pearson estimate). When family=poisson or family=binomial, the dispersion is
always 1.</td>
</tr>
<tr>
<th>num_rows_processed</th>
<td>BIGINT. Numbers of rows processed.</td>
</tr>
<tr>
<th>num_rows_skipped</th>
<td>BIGINT. Numbers of rows skipped due to missing values or failures.</td>
</tr>
<tr>
<th>num_iterations</th>
<td>INTEGER. The number of iterations actually completed. This would be different
from the \c nIterations argument if a \c tolerance parameter is provided and the
algorithm converges before all iterations are completed.</td>
</tr>
</table>
A summary table named \<model_table\>_summary is also created at the same time, which has the following columns:
<table class="output">
<tr>
<th>method</th>
<td>'glm'</td>
</tr>
<tr>
<th>source_table</th>
<td>The data source table name.</td>
</tr>
<tr>
<th>model_table</th>
<td>The model table name.</td>
</tr>
<tr>
<th>dependent_varname</th>
<td>The dependent variable.</td>
</tr>
<tr>
<th>independent_varname</th>
<td>The independent variables</td>
</tr>
<tr>
<th>family_params</th>
<td>A string that contains family parameters, and has the form of 'family=..., link=...'</td>
</tr>
<tr>
<th>grouping_col</th>
<td>Name of grouping columns.</td>
</tr>
<tr>
<th>optimizer_params</th>
<td>A string that contains optimizer parameters, and has the form of 'optimizer=..., max_iter=..., tolerance=...'</td>
</tr>
<tr>
<th>num_all_groups</th>
<td>Number of groups in glm training.</td>
</tr>
<tr>
<th>num_failed_groups</th>
<td>Number of failed groups in glm training.</td>
</tr>
<tr>
<th>total_rows_processed</th>
<td>BIGINT. Total numbers of rows processed in all groups.</td>
</tr>
<tr>
<th>total_rows_skipped</th>
<td>BIGINT. Total numbers of rows skipped in all groups due to missing values or failures.</td>
</tr>
</table>
</DD>
<DT>dependent_varname</DT>
<DD>TEXT. Name of the dependent variable column.</DD>
<DT>independent_varname</DT>
<DD>TEXT. Expression list to evaluate for the
independent variables. An intercept variable is not assumed. It is common to
provide an explicit intercept term by including a single constant \c 1 term in
the independent variable list.</DD>
<DT>family_params (optional)</DT>
<DD>TEXT, Parameters for distribution family. Currently, we
support
(1) family=poisson and link=[log or identity or sqrt].
(2) family=gaussian and link=[identity or log or inverse].
And when family=gaussian and link=identity, the GLM model is
exactly the same as the linear regression.
(3) family=gamma and link=[inverse or identity or log].
(4) family=inverse_gaussian and link=[sqr_inverse or log or identity or inverse].
(5) family=binomial and link=[probit or logit].
</DD>
<DT>grouping_col (optional)</DT>
<DD>TEXT, default: NULL. An expression list used to group
the input dataset into discrete groups, running one regression per group.
Similar to the SQL "GROUP BY" clause. When this value is NULL, no
grouping is used and a single model is generated.</DD>
<DT>optim_params (optional)</DT>
<DD>TEXT, default: 'max_iter=100,optimizer=irls,tolerance=1e-6'.
Parameters for optimizer. Currently, we support
tolerance=[tolerance for relative error between log-likelihoods],
max_iter=[maximum iterations to run], optimizer=irls.</DD>
<DT>verbose (optional)</DT>
<DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training.</DD>
</DL>
<dd>@note For p-values, we just return the computation result directly.
Other statistical packages, like 'R', produce the same result, but on printing the
result to screen, another format function is used and any p-value that is
smaller than the machine epsilon (the smallest positive floating-point number
'x' such that '1 + x != 1') will be printed on screen as "< xxx" (xxx is the
value of the machine epsilon). Although the results may look different, they are
in fact the same.
</dd>
@anchor predict
@par Prediction Function
The prediction function is provided to estimate the conditional mean given a new
predictor. It has the following syntax:
<pre class="syntax">
glm_predict(coef,
col_ind_var
link)
</pre>
\b Arguments
<DL class="arglist">
<DT>coef</DT>
<DD>DOUBLE PRECISION[]. Model coefficients obtained from \ref glm().</DD>
<DT>col_ind_var</DT>
<DD>New predictor, as a DOUBLE array. This should be the same length
as the array obtained by evaluation of the 'independent_varname' argument in
\ref glm().</DD>
<DT>link</DT>
<DD>link function, as a string. This should match the link function the user
inputted in \ref glm().</DD>
</DL>
@anchor examples
@examp
-# Create the training data table.
<pre class="example">
CREATE TABLE warpbreaks(
id serial,
breaks integer,
wool char(1),
tension char(1)
);
INSERT INTO warpbreaks(breaks, wool, tension) VALUES
(26, 'A', 'L'),
(30, 'A', 'L'),
(54, 'A', 'L'),
(25, 'A', 'L'),
(70, 'A', 'L'),
(52, 'A', 'L'),
(51, 'A', 'L'),
(26, 'A', 'L'),
(67, 'A', 'L'),
(18, 'A', 'M'),
(21, 'A', 'M'),
(29, 'A', 'M'),
(17, 'A', 'M'),
(12, 'A', 'M'),
(18, 'A', 'M'),
(35, 'A', 'M'),
(30, 'A', 'M'),
(36, 'A', 'M'),
(36, 'A', 'H'),
(21, 'A', 'H'),
(24, 'A', 'H'),
(18, 'A', 'H'),
(10, 'A', 'H'),
(43, 'A', 'H'),
(28, 'A', 'H'),
(15, 'A', 'H'),
(26, 'A', 'H'),
(27, 'B', 'L'),
(14, 'B', 'L'),
(29, 'B', 'L'),
(19, 'B', 'L'),
(29, 'B', 'L'),
(31, 'B', 'L'),
(41, 'B', 'L'),
(20, 'B', 'L'),
(44, 'B', 'L'),
(42, 'B', 'M'),
(26, 'B', 'M'),
(19, 'B', 'M'),
(16, 'B', 'M'),
(39, 'B', 'M'),
(28, 'B', 'M'),
(21, 'B', 'M'),
(39, 'B', 'M'),
(29, 'B', 'M'),
(20, 'B', 'H'),
(21, 'B', 'H'),
(24, 'B', 'H'),
(17, 'B', 'H'),
(13, 'B', 'H'),
(15, 'B', 'H'),
(15, 'B', 'H'),
(16, 'B', 'H'),
(28, 'B', 'H');
SELECT create_indicator_variables('warpbreaks', 'warpbreaks_dummy', 'wool,tension');
</pre>
-# Train a GLM model.
<pre class="example">
SELECT glm('warpbreaks_dummy',
'glm_model',
'breaks',
'ARRAY[1.0,"wool_B","tension_M", "tension_H"]',
'family=poisson, link=log');
</pre>
-# View the regression results.
<pre class="example">
-- Set extended display on for easier reading of output
\\x on
SELECT * FROM glm_model;
</pre>
Result:
<pre class="result">
coef | {3.69196314494079,-0.205988442638621,-0.321320431600611,-0.51848849651156}
log_likelihood | -242.527983208979
std_err | {0.04541079434248,0.0515712427835191,0.0602659166951256,0.0639595193956924}
z_stats | {81.3014438174473,-3.99425011926316,-5.3317106786264,-8.10651020224019}
p_values | {0,6.48993254938271e-05,9.72918600322907e-08,5.20943463005751e-16}
num_rows_processed | 54
num_rows_skipped | 0
iteration | 5
</pre>
Alternatively, unnest the arrays in the results for easier reading of output:
<pre class="example">
\\x off
SELECT unnest(coef) as coefficient,
unnest(std_err) as standard_error,
unnest(z_stats) as z_stat,
unnest(p_values) as pvalue
FROM glm_model;
</pre>
-# Predicting dependent variable using GLM model.
(This example uses the original data table to perform the prediction. Typically
a different test dataset with the same features as the original training dataset
would be used for prediction.)
<pre class="example">
\\x off
-- Display predicted mean value on the original dataset
SELECT
w.id,
madlib.glm_predict(
coef,
ARRAY[1, "wool_B", "tension_M", "tension_H"]::float8[],
'log') AS mu
FROM warpbreaks_dummy w, glm_model m
ORDER BY w.id;
</pre>
<pre class="example">
-- Display predicted counts (which are predicted mean values rounded to the nearest integral value) on the original dataset
SELECT
w.id,
madlib.glm_predict_poisson(
coef,
ARRAY[1, "wool_B", "tension_M", "tension_H"]::float8[],
'log') AS poisson_count
FROM warpbreaks_dummy w, glm_model m
ORDER BY w.id;
</pre>
<b>Example for Gaussian family:</b>
-# Create a testing data table
<pre class="example">
CREATE TABLE abalone (
id integer,
sex text,
length double precision,
diameter double precision,
height double precision,
whole double precision,
shucked double precision,
viscera double precision,
shell double precision,
rings integer
);
INSERT INTO abalone VALUES
(3151, 'F', 0.655000000000000027, 0.505000000000000004, 0.165000000000000008, 1.36699999999999999, 0.583500000000000019, 0.351499999999999979, 0.396000000000000019, 10),
(2026, 'F', 0.550000000000000044, 0.469999999999999973, 0.149999999999999994, 0.920499999999999985, 0.381000000000000005, 0.243499999999999994, 0.267500000000000016, 10),
(3751, 'I', 0.434999999999999998, 0.375, 0.110000000000000001, 0.41549999999999998, 0.170000000000000012, 0.0759999999999999981, 0.14499999999999999, 8),
(720, 'I', 0.149999999999999994, 0.100000000000000006, 0.0250000000000000014, 0.0149999999999999994, 0.00449999999999999966, 0.00400000000000000008, 0.0050000000000000001, 2),
(1635, 'F', 0.574999999999999956, 0.469999999999999973, 0.154999999999999999, 1.1160000000000001, 0.509000000000000008, 0.237999999999999989, 0.340000000000000024, 10),
(2648, 'I', 0.5, 0.390000000000000013, 0.125, 0.582999999999999963, 0.293999999999999984, 0.132000000000000006, 0.160500000000000004, 8),
(1796, 'F', 0.57999999999999996, 0.429999999999999993, 0.170000000000000012, 1.47999999999999998, 0.65349999999999997, 0.32400000000000001, 0.41549999999999998, 10),
(209, 'F', 0.525000000000000022, 0.41499999999999998, 0.170000000000000012, 0.832500000000000018, 0.275500000000000023, 0.168500000000000011, 0.309999999999999998, 13),
(1451, 'I', 0.455000000000000016, 0.33500000000000002, 0.135000000000000009, 0.501000000000000001, 0.274000000000000021, 0.0995000000000000051, 0.106499999999999997, 7),
(1108, 'I', 0.510000000000000009, 0.380000000000000004, 0.115000000000000005, 0.515499999999999958, 0.214999999999999997, 0.113500000000000004, 0.166000000000000009, 8),
(3675, 'F', 0.594999999999999973, 0.450000000000000011, 0.165000000000000008, 1.08099999999999996, 0.489999999999999991, 0.252500000000000002, 0.279000000000000026, 12),
(2108, 'F', 0.675000000000000044, 0.550000000000000044, 0.179999999999999993, 1.68849999999999989, 0.562000000000000055, 0.370499999999999996, 0.599999999999999978, 15),
(3312, 'F', 0.479999999999999982, 0.380000000000000004, 0.135000000000000009, 0.507000000000000006, 0.191500000000000004, 0.13650000000000001, 0.154999999999999999, 12),
(882, 'M', 0.655000000000000027, 0.520000000000000018, 0.165000000000000008, 1.40949999999999998, 0.585999999999999965, 0.290999999999999981, 0.405000000000000027, 9),
(3402, 'M', 0.479999999999999982, 0.395000000000000018, 0.149999999999999994, 0.681499999999999995, 0.214499999999999996, 0.140500000000000014, 0.2495, 18),
(829, 'I', 0.409999999999999976, 0.325000000000000011, 0.100000000000000006, 0.394000000000000017, 0.20799999999999999, 0.0655000000000000027, 0.105999999999999997, 6),
(1305, 'M', 0.535000000000000031, 0.434999999999999998, 0.149999999999999994, 0.716999999999999971, 0.347499999999999976, 0.14449999999999999, 0.194000000000000006, 9),
(3613, 'M', 0.599999999999999978, 0.46000000000000002, 0.179999999999999993, 1.1399999999999999, 0.422999999999999987, 0.257500000000000007, 0.364999999999999991, 10),
(1068, 'I', 0.340000000000000024, 0.265000000000000013, 0.0800000000000000017, 0.201500000000000012, 0.0899999999999999967, 0.0475000000000000006, 0.0550000000000000003, 5),
(2446, 'M', 0.5, 0.380000000000000004, 0.135000000000000009, 0.583500000000000019, 0.22950000000000001, 0.126500000000000001, 0.179999999999999993, 12),
(1393, 'M', 0.635000000000000009, 0.474999999999999978, 0.170000000000000012, 1.19350000000000001, 0.520499999999999963, 0.269500000000000017, 0.366499999999999992, 10),
(359, 'M', 0.744999999999999996, 0.584999999999999964, 0.214999999999999997, 2.49900000000000011, 0.92649999999999999, 0.471999999999999975, 0.699999999999999956, 17),
(549, 'F', 0.564999999999999947, 0.450000000000000011, 0.160000000000000003, 0.79500000000000004, 0.360499999999999987, 0.155499999999999999, 0.23000000000000001, 12),
(1154, 'F', 0.599999999999999978, 0.474999999999999978, 0.160000000000000003, 1.02649999999999997, 0.484999999999999987, 0.2495, 0.256500000000000006, 9),
(1790, 'F', 0.54500000000000004, 0.385000000000000009, 0.149999999999999994, 1.11850000000000005, 0.542499999999999982, 0.244499999999999995, 0.284499999999999975, 9),
(3703, 'F', 0.665000000000000036, 0.540000000000000036, 0.195000000000000007, 1.76400000000000001, 0.850500000000000034, 0.361499999999999988, 0.469999999999999973, 11),
(1962, 'F', 0.655000000000000027, 0.515000000000000013, 0.179999999999999993, 1.41199999999999992, 0.619500000000000051, 0.248499999999999999, 0.496999999999999997, 11),
(1665, 'I', 0.604999999999999982, 0.469999999999999973, 0.14499999999999999, 0.802499999999999991, 0.379000000000000004, 0.226500000000000007, 0.220000000000000001, 9),
(635, 'M', 0.359999999999999987, 0.294999999999999984, 0.100000000000000006, 0.210499999999999993, 0.0660000000000000031, 0.0524999999999999981, 0.0749999999999999972, 9),
(3901, 'M', 0.445000000000000007, 0.344999999999999973, 0.140000000000000013, 0.475999999999999979, 0.205499999999999988, 0.101500000000000007, 0.108499999999999999, 15),
(2734, 'I', 0.41499999999999998, 0.33500000000000002, 0.100000000000000006, 0.357999999999999985, 0.169000000000000011, 0.067000000000000004, 0.104999999999999996, 7),
(3856, 'M', 0.409999999999999976, 0.33500000000000002, 0.115000000000000005, 0.440500000000000003, 0.190000000000000002, 0.0850000000000000061, 0.135000000000000009, 8),
(827, 'I', 0.395000000000000018, 0.28999999999999998, 0.0950000000000000011, 0.303999999999999992, 0.127000000000000002, 0.0840000000000000052, 0.076999999999999999, 6),
(3381, 'I', 0.190000000000000002, 0.130000000000000004, 0.0449999999999999983, 0.0264999999999999993, 0.00899999999999999932, 0.0050000000000000001, 0.00899999999999999932, 5),
(3972, 'I', 0.400000000000000022, 0.294999999999999984, 0.0950000000000000011, 0.252000000000000002, 0.110500000000000001, 0.0575000000000000025, 0.0660000000000000031, 6),
(1155, 'M', 0.599999999999999978, 0.455000000000000016, 0.170000000000000012, 1.1915, 0.695999999999999952, 0.239499999999999991, 0.239999999999999991, 8),
(3467, 'M', 0.640000000000000013, 0.5, 0.170000000000000012, 1.4544999999999999, 0.642000000000000015, 0.357499999999999984, 0.353999999999999981, 9),
(2433, 'F', 0.609999999999999987, 0.484999999999999987, 0.165000000000000008, 1.08699999999999997, 0.425499999999999989, 0.232000000000000012, 0.380000000000000004, 11),
(552, 'I', 0.614999999999999991, 0.489999999999999991, 0.154999999999999999, 0.988500000000000045, 0.41449999999999998, 0.195000000000000007, 0.344999999999999973, 13),
(1425, 'F', 0.729999999999999982, 0.57999999999999996, 0.190000000000000002, 1.73750000000000004, 0.678499999999999992, 0.434499999999999997, 0.520000000000000018, 11),
(2402, 'F', 0.584999999999999964, 0.41499999999999998, 0.154999999999999999, 0.69850000000000001, 0.299999999999999989, 0.145999999999999991, 0.195000000000000007, 12),
(1748, 'F', 0.699999999999999956, 0.535000000000000031, 0.174999999999999989, 1.77299999999999991, 0.680499999999999994, 0.479999999999999982, 0.512000000000000011, 15),
(3983, 'I', 0.57999999999999996, 0.434999999999999998, 0.149999999999999994, 0.891499999999999959, 0.362999999999999989, 0.192500000000000004, 0.251500000000000001, 6),
(335, 'F', 0.739999999999999991, 0.599999999999999978, 0.195000000000000007, 1.97399999999999998, 0.597999999999999976, 0.408499999999999974, 0.709999999999999964, 16),
(1587, 'I', 0.515000000000000013, 0.349999999999999978, 0.104999999999999996, 0.474499999999999977, 0.212999999999999995, 0.122999999999999998, 0.127500000000000002, 10),
(2448, 'I', 0.275000000000000022, 0.204999999999999988, 0.0800000000000000017, 0.096000000000000002, 0.0359999999999999973, 0.0184999999999999991, 0.0299999999999999989, 6),
(1362, 'F', 0.604999999999999982, 0.474999999999999978, 0.174999999999999989, 1.07600000000000007, 0.463000000000000023, 0.219500000000000001, 0.33500000000000002, 9),
(2799, 'M', 0.640000000000000013, 0.484999999999999987, 0.149999999999999994, 1.09800000000000009, 0.519499999999999962, 0.222000000000000003, 0.317500000000000004, 10),
(1413, 'F', 0.67000000000000004, 0.505000000000000004, 0.174999999999999989, 1.01449999999999996, 0.4375, 0.271000000000000019, 0.3745, 10),
(1739, 'F', 0.67000000000000004, 0.540000000000000036, 0.195000000000000007, 1.61899999999999999, 0.739999999999999991, 0.330500000000000016, 0.465000000000000024, 11),
(1152, 'M', 0.584999999999999964, 0.465000000000000024, 0.160000000000000003, 0.955500000000000016, 0.45950000000000002, 0.235999999999999988, 0.265000000000000013, 7),
(2427, 'I', 0.564999999999999947, 0.434999999999999998, 0.154999999999999999, 0.782000000000000028, 0.271500000000000019, 0.16800000000000001, 0.284999999999999976, 14),
(1777, 'M', 0.484999999999999987, 0.369999999999999996, 0.154999999999999999, 0.967999999999999972, 0.418999999999999984, 0.245499999999999996, 0.236499999999999988, 9),
(3294, 'M', 0.574999999999999956, 0.455000000000000016, 0.184999999999999998, 1.15599999999999992, 0.552499999999999991, 0.242999999999999994, 0.294999999999999984, 13),
(1403, 'M', 0.650000000000000022, 0.510000000000000009, 0.190000000000000002, 1.54200000000000004, 0.715500000000000025, 0.373499999999999999, 0.375, 9),
(2256, 'M', 0.510000000000000009, 0.395000000000000018, 0.14499999999999999, 0.61850000000000005, 0.215999999999999998, 0.138500000000000012, 0.239999999999999991, 12),
(3984, 'F', 0.584999999999999964, 0.450000000000000011, 0.125, 0.873999999999999999, 0.354499999999999982, 0.20749999999999999, 0.225000000000000006, 6),
(1116, 'M', 0.525000000000000022, 0.405000000000000027, 0.119999999999999996, 0.755499999999999949, 0.3755, 0.155499999999999999, 0.201000000000000012, 9),
(1366, 'M', 0.609999999999999987, 0.474999999999999978, 0.170000000000000012, 1.02649999999999997, 0.434999999999999998, 0.233500000000000013, 0.303499999999999992, 10),
(3759, 'I', 0.525000000000000022, 0.400000000000000022, 0.140000000000000013, 0.605500000000000038, 0.260500000000000009, 0.107999999999999999, 0.209999999999999992, 9);
</pre>
-# Train a model with family=gaussian and link=identity
<pre class="example">
SELECT madlib.glm(
'abalone',
'abalone_out',
'rings',
'ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]',
'family=gaussian, link=identity');
</pre>
<b>Example for Gamma family:</b> (reuse the dataset in Gaussian case)
-# Reuse the test data set in Gaussian
-# Train a model with family=gamma and link=inverse
<pre class="example">
SELECT madlib.glm(
'abalone',
'abalone_out',
'rings',
'ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]',
'family=gamma, link=inverse');
</pre>
<b>Example for Inverse Gaussian family:</b> (reuse the dataset in Gaussian case)
-# Reuse the test data set in Gaussian
-# Train a model with family=inverse_gaussian and link=sqr_inverse
<pre class="example">
SELECT madlib.glm(
'abalone',
'abalone_out',
'rings',
'ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]',
'family=inverse_gaussian, link=sqr_inverse');
</pre>
<b>Example for Binomial family:</b> (reuse the dataset in Gaussian case)
-# Reuse the test data set in Gaussian
-# Train a model with family=binomial and link=probit
<pre class="example">
SELECT madlib.glm(
'abalone',
'abalone_out',
'rings < 10',
'ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]',
'family=binomial, link=probit');
</pre>
-# Predict output probabilities
<pre class="example">
SELECT madlib.glm_predict(
coef,
ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]::float8[],
'probit')
FROM abalone_out, abalone;
</pre>
-# Predict output categories
<pre class="example">
SELECT madlib.glm_predict(
SELECT madlib.glm_predict_binomial(
coef,
ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]::float8[],
'probit')
FROM abalone_out, abalone;
</pre>
@anchor notes
@par Notes
All table names can be optionally schema qualified (current_schemas() would be
searched if a schema name is not provided) and all table and column names
should follow case-sensitivity and quoting rules per the database.
(For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'.
If mixed-case or multi-byte characters are desired for entity names then the
string should be double-quoted; in this case the input would be '"MyTable"').
Currently implementation uses Newton's method and, according to performance tests,
when number of features are over 1000, this GLM function could be running slowly.
Functions in \ref grp_linreg is prefered to GLM with family=gaussian,link=identity,
as the former require only a single pass over the training data.
In addition, if user expects to use robust variance, clustered variance, or marginal
effects on top of the trained model, functions in \ref grp_linreg and
\ref grp_logreg should be used.
@anchor related
@par Related Topics
File glm.sql_in documenting the training function
\ref grp_linreg
\ref grp_logreg
\ref grp_mlogreg
\ref grp_robust
\ref grp_clustered_errors
\ref grp_validation
\ref grp_marginal
@internal
@sa Namespace GLM (documenting the driver/outer loop implemented in
Python), Namespace
\ref madlib::modules::regress documenting the implementation in C++
@endinternal
*/
------------------------------------------------------------------------
DROP TYPE IF EXISTS MADLIB_SCHEMA.__glm_result_type CASCADE;
CREATE TYPE MADLIB_SCHEMA.__glm_result_type AS (
coef double precision[],
loglik double precision,
std_err double precision[],
z_stats double precision[],
p_values double precision[],
num_rows_processed bigint,
dispersion double precision
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_merge_states(
state1 MADLIB_SCHEMA.bytea8,
state2 MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_merge_states'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_final(
state MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_final'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_poisson_log_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_poisson_log_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_poisson_log_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_poisson_log_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_poisson_log_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_poisson_identity_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_poisson_identity_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_poisson_identity_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_poisson_identity_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_poisson_identity_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_poisson_sqrt_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_poisson_sqrt_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_poisson_sqrt_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_poisson_sqrt_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_poisson_sqrt_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gaussian_identity_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_gaussian_identity_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gaussian_identity_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_gaussian_identity_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_gaussian_identity_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gaussian_log_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_gaussian_log_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gaussian_log_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_gaussian_log_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_gaussian_log_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gaussian_inverse_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_gaussian_inverse_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gaussian_inverse_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_gaussian_inverse_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_gaussian_inverse_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gamma_log_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_gamma_log_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gamma_log_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_gamma_log_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_gamma_log_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gamma_inverse_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_gamma_inverse_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gamma_inverse_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_gamma_inverse_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_gamma_inverse_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gamma_identity_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_gamma_identity_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gamma_identity_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_gamma_identity_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_gamma_identity_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_binomial_probit_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_binomial_probit_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_binomial_probit_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_binomial_probit_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_binomial_probit_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_inverse_gaussian_identity_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_inverse_gaussian_identity_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_binomial_logit_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_binomial_logit_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_binomial_logit_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_binomial_logit_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_binomial_logit_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_result_z_stats(
/*+ state */ MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.__glm_result_type
AS 'MODULE_PATHNAME', 'glm_result_z_stats'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_inverse_gaussian_identity_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_inverse_gaussian_identity_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_inverse_gaussian_identity_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_inverse_gaussian_log_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_inverse_gaussian_log_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_inverse_gaussian_log_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_inverse_gaussian_log_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_inverse_gaussian_log_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_inverse_gaussian_inverse_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_inverse_gaussian_inverse_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_inverse_gaussian_inverse_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_inverse_gaussian_inverse_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_inverse_gaussian_inverse_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_inverse_gaussian_sqr_inverse_transition(
MADLIB_SCHEMA.bytea8,
double precision,
double precision[],
MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.bytea8
AS 'MODULE_PATHNAME', 'glm_inverse_gaussian_sqr_inverse_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_inverse_gaussian_sqr_inverse_agg(
double precision, double precision[], MADLIB_SCHEMA.bytea8);
CREATE AGGREGATE MADLIB_SCHEMA.__glm_inverse_gaussian_sqr_inverse_agg(
/*+ y */ double precision,
/*+ x */ double precision[],
/*+ previous_state */ MADLIB_SCHEMA.bytea8) (
STYPE=MADLIB_SCHEMA.bytea8,
SFUNC=MADLIB_SCHEMA.__glm_inverse_gaussian_sqr_inverse_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,')
FINALFUNC=MADLIB_SCHEMA.__glm_final,
INITCOND=''
);
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_result_z_stats(
/*+ state */ MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.__glm_result_type
AS 'MODULE_PATHNAME', 'glm_result_z_stats'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_result_t_stats(
/*+ state */ MADLIB_SCHEMA.bytea8)
RETURNS MADLIB_SCHEMA.__glm_result_type
AS 'MODULE_PATHNAME', 'glm_result_t_stats'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_loglik_diff(
/*+ state1 */ MADLIB_SCHEMA.bytea8,
/*+ state2 */ MADLIB_SCHEMA.bytea8)
RETURNS double precision
AS 'MODULE_PATHNAME', 'glm_loglik_diff'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
------------------------------------------------------------------------
------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm(
source_table varchar,
model_table varchar,
dependent_varname varchar,
independent_varname varchar,
family_params varchar,
grouping_col varchar,
optim_params varchar,
verbose boolean
) RETURNS void AS $$
PythonFunction(glm, glm, glm)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-- entry functions with default values
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm(
source_table varchar,
model_table varchar,
dependent_varname varchar,
independent_varname varchar,
family_params varchar,
grouping_col varchar,
optim_params varchar
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.glm($1, $2, $3, $4, $5, $6, $7, FALSE);
$$ LANGUAGE sql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm(
source_table varchar,
model_table varchar,
dependent_varname varchar,
independent_varname varchar,
family_params varchar,
grouping_col varchar
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.glm($1, $2, $3, $4, $5, $6, NULL, FALSE);
$$ LANGUAGE sql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm(
source_table varchar,
model_table varchar,
dependent_varname varchar,
independent_varname varchar,
family_params varchar
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.glm($1, $2, $3, $4, $5, NULL, NULL, FALSE);
$$ LANGUAGE sql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-- skip the default family before we implement gaussian using this framework
-- Help messages -------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(glm, glm, glm_help_msg)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm()
RETURNS TEXT
AS $$
SELECT MADLIB_SCHEMA.glm(NULL::TEXT);
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
-------------------------------------------------------------------------
-- Generalized linear model: Prediction ---------------------------------------------
/**
* @brief Predict the estimated mean value for the response variable given a specific
* predictor variable value in a generalized linear model
*
* @param coef Coefficients obtained by running generalized linear model.
* @param col_ind Predictor variable array
* @param link Link function used in training. Can be one of probit/logit
* @returns Numeric value of the predicted mean
*
* This function computes the dot product of the independent variables and the
* coefficients. This requires the length of the two vectors to be the same.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict(
coef DOUBLE PRECISION[],
col_ind_var DOUBLE PRECISION[],
link TEXT
) RETURNS DOUBLE PRECISION
AS 'MODULE_PATHNAME', 'glm_predict'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/**
* @brief Predict the output category for the response variable given a specific
* predictor variable value in a generalized linear model
*
* @param coef Coefficients obtained by running generalized linear model.
* @param col_ind Predictor variable array
* @param link Link function used in training. Can be one of probit/logit.
* @returns True/False Boolean value corresponding to output category
True if predicted probability >= 0.5, False otherwise
*
* This function computes the dot product of the independent variables and the
* coefficients. This requires the length of the two vectors to be the same.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict_binomial(
coef DOUBLE PRECISION[],
col_ind_var DOUBLE PRECISION[],
link TEXT
) RETURNS BOOLEAN
AS 'MODULE_PATHNAME', 'glm_predict_binomial'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/**
* @brief Predict the estimated count for the response variable given a specific
* predictor variable value in a generalized linear model
*
* @param coef Coefficients obtained by running generalized linear model.
* @param col_ind Predictor variable array
* @param link Link function used in training
* @returns Numeric value of the predicted count, obtained by rounding the predicted mean
* to the nearest integral value.
*
* This function computes the dot product of the independent variables and the
* coefficients. This requires the length of the two vectors to be the same.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict_poisson(
coef DOUBLE PRECISION[],
col_ind_var DOUBLE PRECISION[],
link TEXT
) RETURNS DOUBLE PRECISION
AS 'MODULE_PATHNAME', 'glm_predict_poisson'
LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-- Help messages -------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(glm, glm, glm_predict_help_msg)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict()
RETURNS TEXT
AS $$
SELECT MADLIB_SCHEMA.glm_predict(''::TEXT);
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict_poisson(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(glm, glm, glm_predict_poisson_help_msg)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict_binomial(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(glm, glm, glm_predict_binomial_help_msg)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
-------------------------------------------------------------