| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file glm.sql_in |
| * |
| * @brief SQL functions for GLM (Poisson) |
| * @date June 2014 |
| * |
| * @sa For a brief introduction to GLM (Poisson), see the |
| * module description \ref grp_poisson. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_glm |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#train">Training Function</a></li> |
| <li class="level1"><a href="#predict">Prediction Function</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| @brief Estimate generalized linear model (GLM). |
| GLM is a flexible generalization of ordinary linear regression that allows for |
| response variables that have error distribution models other than a normal |
| distribution. The GLM generalizes linear regression by allowing the linear |
| model to be related to the response variable via a link function and by |
| allowing the magnitude of the variance of each measurement to be a function |
| of its predicted value. |
| |
| Generalized linear models extends ordinary linear regression by allowing the |
| response variable to follow a more general set of distributions (rather than |
| simply Gaussian distributions), and for a general family of functions of the |
| response variable (the link function) to vary linearly with the predicted values |
| (rather than assuming that the response itself must vary linearly). |
| |
| For example, data of counts would typically be modeled with a Poisson |
| distribution and a log link, while binary outcomes would typically be modeled |
| with a Bernoulli distribution (or binomial distribution, depending on exactly |
| how the problem is phrased) and a log-odds (or logit) link function. |
| |
| Currently, the implemented distribution families are |
| <center> |
| <table> |
| <tr> |
| <th>Distribution Family</th> |
| <th>Link Functions</th> |
| </tr> |
| <tr> |
| <td>Binomial</td> |
| <td>logit, probit</td> |
| </tr> |
| <tr> |
| <td>Gamma</td> |
| <td>inverse, identity, log</td> |
| </tr> |
| <tr> |
| <td>Gaussian</td> |
| <td>identity, inverse, log</td> |
| </tr> |
| <tr> |
| <td>Inverse Gaussian</td> |
| <td>inverse of square, inverse, identity, log</td> |
| </tr> |
| <tr> |
| <td>Poisson</td> |
| <td>log, identity, square-root<br></td> |
| </tr> |
| </table> |
| </center> |
| |
| @anchor train |
| @par Training Function |
| GLM training function has the following format: |
| <pre class="syntax"> |
| glm(source_table, |
| model_table, |
| dependent_varname, |
| independent_varname, |
| family_params, |
| grouping_col, |
| optim_params, |
| verbose |
| ) |
| </pre> |
| \b Arguments |
| <DL class="arglist"> |
| <DT>source_table</DT> |
| <DD>TEXT. The name of the table containing the training data.</DD> |
| |
| <DT>model_table</DT> |
| <DD>TEXT. Name of the generated table containing the model. |
| |
| The model table produced by glm contains the following columns: |
| |
| <table class="output"> |
| <tr> |
| <th><...></th> |
| <td>Text. Grouping columns, if provided in input. This could be multiple columns |
| depending on the \c grouping_col input.</td> |
| </tr> |
| |
| <tr> |
| <th>coef</th> |
| <td>FLOAT8. Vector of the coefficients in linear predictor.</td> |
| </tr> |
| |
| <tr> |
| <th>log_likelihood</th> |
| <td>FLOAT8. The log-likelihood \f$ l(\boldsymbol \beta) \f$. We use the maximum likelihood estimate of |
| dispersion parameter to calculate the log-likelihood while R and Python use deviance estimate and |
| Pearson estimate respectively.</td> |
| </tr> |
| |
| <tr> |
| <th>std_err</th> |
| <td>FLOAT8[]. Vector of the standard error of the coefficients.</td> |
| </tr> |
| |
| <tr> |
| <th>z_stats or t_stats</th> |
| <td>FLOAT8[]. Vector of the z-statistics (in Poisson distribtuion and Binomial distribution) or the t-statistics (in all other distributions) of the coefficients.</td> |
| </tr> |
| |
| <tr> |
| <th>p_values</th> |
| <td>FLOAT8[]. Vector of the p-values of the coefficients.</td> |
| </tr> |
| |
| <tr> |
| <th>dispersion</th> |
| <td>FLOAT8. The dispersion value (Pearson estimate). When family=poisson or family=binomial, the dispersion is |
| always 1.</td> |
| </tr> |
| |
| <tr> |
| <th>num_rows_processed</th> |
| <td>BIGINT. Numbers of rows processed.</td> |
| </tr> |
| |
| <tr> |
| <th>num_rows_skipped</th> |
| <td>BIGINT. Numbers of rows skipped due to missing values or failures.</td> |
| </tr> |
| |
| <tr> |
| <th>num_iterations</th> |
| <td>INTEGER. The number of iterations actually completed. This would be different |
| from the \c nIterations argument if a \c tolerance parameter is provided and the |
| algorithm converges before all iterations are completed.</td> |
| </tr> |
| </table> |
| |
| A summary table named \<model_table\>_summary is also created at the same time, which has the following columns: |
| <table class="output"> |
| |
| <tr> |
| <th>method</th> |
| <td>'glm'</td> |
| </tr> |
| |
| <tr> |
| <th>source_table</th> |
| <td>The data source table name.</td> |
| </tr> |
| |
| <tr> |
| <th>model_table</th> |
| <td>The model table name.</td> |
| </tr> |
| |
| <tr> |
| <th>dependent_varname</th> |
| <td>The dependent variable.</td> |
| </tr> |
| |
| <tr> |
| <th>independent_varname</th> |
| <td>The independent variables</td> |
| </tr> |
| |
| <tr> |
| <th>family_params</th> |
| <td>A string that contains family parameters, and has the form of 'family=..., link=...'</td> |
| </tr> |
| |
| <tr> |
| <th>grouping_col</th> |
| <td>Name of grouping columns.</td> |
| </tr> |
| |
| <tr> |
| <th>optimizer_params</th> |
| <td>A string that contains optimizer parameters, and has the form of 'optimizer=..., max_iter=..., tolerance=...'</td> |
| </tr> |
| |
| <tr> |
| <th>num_all_groups</th> |
| <td>Number of groups in glm training.</td> |
| </tr> |
| |
| <tr> |
| <th>num_failed_groups</th> |
| <td>Number of failed groups in glm training.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_processed</th> |
| <td>BIGINT. Total numbers of rows processed in all groups.</td> |
| </tr> |
| |
| <tr> |
| <th>total_rows_skipped</th> |
| <td>BIGINT. Total numbers of rows skipped in all groups due to missing values or failures.</td> |
| </tr> |
| |
| </table> |
| </DD> |
| |
| <DT>dependent_varname</DT> |
| <DD>TEXT. Name of the dependent variable column.</DD> |
| |
| <DT>independent_varname</DT> |
| <DD>TEXT. Expression list to evaluate for the |
| independent variables. An intercept variable is not assumed. It is common to |
| provide an explicit intercept term by including a single constant \c 1 term in |
| the independent variable list.</DD> |
| |
| <DT>family_params (optional)</DT> |
| <DD>TEXT, Parameters for distribution family. Currently, we |
| support |
| |
| (1) family=poisson and link=[log or identity or sqrt]. |
| |
| (2) family=gaussian and link=[identity or log or inverse]. |
| And when family=gaussian and link=identity, the GLM model is |
| exactly the same as the linear regression. |
| |
| (3) family=gamma and link=[inverse or identity or log]. |
| |
| (4) family=inverse_gaussian and link=[sqr_inverse or log or identity or inverse]. |
| |
| (5) family=binomial and link=[probit or logit]. |
| </DD> |
| |
| <DT>grouping_col (optional)</DT> |
| <DD>TEXT, default: NULL. An expression list used to group |
| the input dataset into discrete groups, running one regression per group. |
| Similar to the SQL "GROUP BY" clause. When this value is NULL, no |
| grouping is used and a single model is generated.</DD> |
| |
| <DT>optim_params (optional)</DT> |
| <DD>TEXT, default: 'max_iter=100,optimizer=irls,tolerance=1e-6'. |
| Parameters for optimizer. Currently, we support |
| tolerance=[tolerance for relative error between log-likelihoods], |
| max_iter=[maximum iterations to run], optimizer=irls.</DD> |
| |
| <DT>verbose (optional)</DT> |
| <DD>BOOLEAN, default: FALSE. Provides verbose output of the results of training.</DD> |
| </DL> |
| |
| <dd>@note For p-values, we just return the computation result directly. |
| Other statistical packages, like 'R', produce the same result, but on printing the |
| result to screen, another format function is used and any p-value that is |
| smaller than the machine epsilon (the smallest positive floating-point number |
| 'x' such that '1 + x != 1') will be printed on screen as "< xxx" (xxx is the |
| value of the machine epsilon). Although the results may look different, they are |
| in fact the same. |
| </dd> |
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function is provided to estimate the conditional mean given a new |
| predictor. It has the following syntax: |
| <pre class="syntax"> |
| glm_predict(coef, |
| col_ind_var |
| link) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>coef</DT> |
| <DD>DOUBLE PRECISION[]. Model coefficients obtained from \ref glm().</DD> |
| |
| <DT>col_ind_var</DT> |
| <DD>New predictor, as a DOUBLE array. This should be the same length |
| as the array obtained by evaluation of the 'independent_varname' argument in |
| \ref glm().</DD> |
| |
| <DT>link</DT> |
| <DD>link function, as a string. This should match the link function the user |
| inputted in \ref glm().</DD> |
| </DL> |
| |
| |
| @anchor examples |
| @examp |
| -# Create the training data table. |
| <pre class="example"> |
| CREATE TABLE warpbreaks( |
| id serial, |
| breaks integer, |
| wool char(1), |
| tension char(1) |
| ); |
| INSERT INTO warpbreaks(breaks, wool, tension) VALUES |
| (26, 'A', 'L'), |
| (30, 'A', 'L'), |
| (54, 'A', 'L'), |
| (25, 'A', 'L'), |
| (70, 'A', 'L'), |
| (52, 'A', 'L'), |
| (51, 'A', 'L'), |
| (26, 'A', 'L'), |
| (67, 'A', 'L'), |
| (18, 'A', 'M'), |
| (21, 'A', 'M'), |
| (29, 'A', 'M'), |
| (17, 'A', 'M'), |
| (12, 'A', 'M'), |
| (18, 'A', 'M'), |
| (35, 'A', 'M'), |
| (30, 'A', 'M'), |
| (36, 'A', 'M'), |
| (36, 'A', 'H'), |
| (21, 'A', 'H'), |
| (24, 'A', 'H'), |
| (18, 'A', 'H'), |
| (10, 'A', 'H'), |
| (43, 'A', 'H'), |
| (28, 'A', 'H'), |
| (15, 'A', 'H'), |
| (26, 'A', 'H'), |
| (27, 'B', 'L'), |
| (14, 'B', 'L'), |
| (29, 'B', 'L'), |
| (19, 'B', 'L'), |
| (29, 'B', 'L'), |
| (31, 'B', 'L'), |
| (41, 'B', 'L'), |
| (20, 'B', 'L'), |
| (44, 'B', 'L'), |
| (42, 'B', 'M'), |
| (26, 'B', 'M'), |
| (19, 'B', 'M'), |
| (16, 'B', 'M'), |
| (39, 'B', 'M'), |
| (28, 'B', 'M'), |
| (21, 'B', 'M'), |
| (39, 'B', 'M'), |
| (29, 'B', 'M'), |
| (20, 'B', 'H'), |
| (21, 'B', 'H'), |
| (24, 'B', 'H'), |
| (17, 'B', 'H'), |
| (13, 'B', 'H'), |
| (15, 'B', 'H'), |
| (15, 'B', 'H'), |
| (16, 'B', 'H'), |
| (28, 'B', 'H'); |
| SELECT create_indicator_variables('warpbreaks', 'warpbreaks_dummy', 'wool,tension'); |
| </pre> |
| |
| -# Train a GLM model. |
| <pre class="example"> |
| SELECT glm('warpbreaks_dummy', |
| 'glm_model', |
| 'breaks', |
| 'ARRAY[1.0,"wool_B","tension_M", "tension_H"]', |
| 'family=poisson, link=log'); |
| </pre> |
| |
| -# View the regression results. |
| <pre class="example"> |
| -- Set extended display on for easier reading of output |
| \\x on |
| SELECT * FROM glm_model; |
| </pre> |
| Result: |
| <pre class="result"> |
| coef | {3.69196314494079,-0.205988442638621,-0.321320431600611,-0.51848849651156} |
| log_likelihood | -242.527983208979 |
| std_err | {0.04541079434248,0.0515712427835191,0.0602659166951256,0.0639595193956924} |
| z_stats | {81.3014438174473,-3.99425011926316,-5.3317106786264,-8.10651020224019} |
| p_values | {0,6.48993254938271e-05,9.72918600322907e-08,5.20943463005751e-16} |
| num_rows_processed | 54 |
| num_rows_skipped | 0 |
| iteration | 5 |
| </pre> |
| Alternatively, unnest the arrays in the results for easier reading of output: |
| <pre class="example"> |
| \\x off |
| SELECT unnest(coef) as coefficient, |
| unnest(std_err) as standard_error, |
| unnest(z_stats) as z_stat, |
| unnest(p_values) as pvalue |
| FROM glm_model; |
| </pre> |
| |
| -# Predicting dependent variable using GLM model. |
| (This example uses the original data table to perform the prediction. Typically |
| a different test dataset with the same features as the original training dataset |
| would be used for prediction.) |
| <pre class="example"> |
| \\x off |
| -- Display predicted mean value on the original dataset |
| SELECT |
| w.id, |
| madlib.glm_predict( |
| coef, |
| ARRAY[1, "wool_B", "tension_M", "tension_H"]::float8[], |
| 'log') AS mu |
| FROM warpbreaks_dummy w, glm_model m |
| ORDER BY w.id; |
| </pre> |
| <pre class="example"> |
| -- Display predicted counts (which are predicted mean values rounded to the nearest integral value) on the original dataset |
| SELECT |
| w.id, |
| madlib.glm_predict_poisson( |
| coef, |
| ARRAY[1, "wool_B", "tension_M", "tension_H"]::float8[], |
| 'log') AS poisson_count |
| FROM warpbreaks_dummy w, glm_model m |
| ORDER BY w.id; |
| </pre> |
| |
| |
| <b>Example for Gaussian family:</b> |
| |
| -# Create a testing data table |
| <pre class="example"> |
| CREATE TABLE abalone ( |
| id integer, |
| sex text, |
| length double precision, |
| diameter double precision, |
| height double precision, |
| whole double precision, |
| shucked double precision, |
| viscera double precision, |
| shell double precision, |
| rings integer |
| ); |
| INSERT INTO abalone VALUES |
| (3151, 'F', 0.655000000000000027, 0.505000000000000004, 0.165000000000000008, 1.36699999999999999, 0.583500000000000019, 0.351499999999999979, 0.396000000000000019, 10), |
| (2026, 'F', 0.550000000000000044, 0.469999999999999973, 0.149999999999999994, 0.920499999999999985, 0.381000000000000005, 0.243499999999999994, 0.267500000000000016, 10), |
| (3751, 'I', 0.434999999999999998, 0.375, 0.110000000000000001, 0.41549999999999998, 0.170000000000000012, 0.0759999999999999981, 0.14499999999999999, 8), |
| (720, 'I', 0.149999999999999994, 0.100000000000000006, 0.0250000000000000014, 0.0149999999999999994, 0.00449999999999999966, 0.00400000000000000008, 0.0050000000000000001, 2), |
| (1635, 'F', 0.574999999999999956, 0.469999999999999973, 0.154999999999999999, 1.1160000000000001, 0.509000000000000008, 0.237999999999999989, 0.340000000000000024, 10), |
| (2648, 'I', 0.5, 0.390000000000000013, 0.125, 0.582999999999999963, 0.293999999999999984, 0.132000000000000006, 0.160500000000000004, 8), |
| (1796, 'F', 0.57999999999999996, 0.429999999999999993, 0.170000000000000012, 1.47999999999999998, 0.65349999999999997, 0.32400000000000001, 0.41549999999999998, 10), |
| (209, 'F', 0.525000000000000022, 0.41499999999999998, 0.170000000000000012, 0.832500000000000018, 0.275500000000000023, 0.168500000000000011, 0.309999999999999998, 13), |
| (1451, 'I', 0.455000000000000016, 0.33500000000000002, 0.135000000000000009, 0.501000000000000001, 0.274000000000000021, 0.0995000000000000051, 0.106499999999999997, 7), |
| (1108, 'I', 0.510000000000000009, 0.380000000000000004, 0.115000000000000005, 0.515499999999999958, 0.214999999999999997, 0.113500000000000004, 0.166000000000000009, 8), |
| (3675, 'F', 0.594999999999999973, 0.450000000000000011, 0.165000000000000008, 1.08099999999999996, 0.489999999999999991, 0.252500000000000002, 0.279000000000000026, 12), |
| (2108, 'F', 0.675000000000000044, 0.550000000000000044, 0.179999999999999993, 1.68849999999999989, 0.562000000000000055, 0.370499999999999996, 0.599999999999999978, 15), |
| (3312, 'F', 0.479999999999999982, 0.380000000000000004, 0.135000000000000009, 0.507000000000000006, 0.191500000000000004, 0.13650000000000001, 0.154999999999999999, 12), |
| (882, 'M', 0.655000000000000027, 0.520000000000000018, 0.165000000000000008, 1.40949999999999998, 0.585999999999999965, 0.290999999999999981, 0.405000000000000027, 9), |
| (3402, 'M', 0.479999999999999982, 0.395000000000000018, 0.149999999999999994, 0.681499999999999995, 0.214499999999999996, 0.140500000000000014, 0.2495, 18), |
| (829, 'I', 0.409999999999999976, 0.325000000000000011, 0.100000000000000006, 0.394000000000000017, 0.20799999999999999, 0.0655000000000000027, 0.105999999999999997, 6), |
| (1305, 'M', 0.535000000000000031, 0.434999999999999998, 0.149999999999999994, 0.716999999999999971, 0.347499999999999976, 0.14449999999999999, 0.194000000000000006, 9), |
| (3613, 'M', 0.599999999999999978, 0.46000000000000002, 0.179999999999999993, 1.1399999999999999, 0.422999999999999987, 0.257500000000000007, 0.364999999999999991, 10), |
| (1068, 'I', 0.340000000000000024, 0.265000000000000013, 0.0800000000000000017, 0.201500000000000012, 0.0899999999999999967, 0.0475000000000000006, 0.0550000000000000003, 5), |
| (2446, 'M', 0.5, 0.380000000000000004, 0.135000000000000009, 0.583500000000000019, 0.22950000000000001, 0.126500000000000001, 0.179999999999999993, 12), |
| (1393, 'M', 0.635000000000000009, 0.474999999999999978, 0.170000000000000012, 1.19350000000000001, 0.520499999999999963, 0.269500000000000017, 0.366499999999999992, 10), |
| (359, 'M', 0.744999999999999996, 0.584999999999999964, 0.214999999999999997, 2.49900000000000011, 0.92649999999999999, 0.471999999999999975, 0.699999999999999956, 17), |
| (549, 'F', 0.564999999999999947, 0.450000000000000011, 0.160000000000000003, 0.79500000000000004, 0.360499999999999987, 0.155499999999999999, 0.23000000000000001, 12), |
| (1154, 'F', 0.599999999999999978, 0.474999999999999978, 0.160000000000000003, 1.02649999999999997, 0.484999999999999987, 0.2495, 0.256500000000000006, 9), |
| (1790, 'F', 0.54500000000000004, 0.385000000000000009, 0.149999999999999994, 1.11850000000000005, 0.542499999999999982, 0.244499999999999995, 0.284499999999999975, 9), |
| (3703, 'F', 0.665000000000000036, 0.540000000000000036, 0.195000000000000007, 1.76400000000000001, 0.850500000000000034, 0.361499999999999988, 0.469999999999999973, 11), |
| (1962, 'F', 0.655000000000000027, 0.515000000000000013, 0.179999999999999993, 1.41199999999999992, 0.619500000000000051, 0.248499999999999999, 0.496999999999999997, 11), |
| (1665, 'I', 0.604999999999999982, 0.469999999999999973, 0.14499999999999999, 0.802499999999999991, 0.379000000000000004, 0.226500000000000007, 0.220000000000000001, 9), |
| (635, 'M', 0.359999999999999987, 0.294999999999999984, 0.100000000000000006, 0.210499999999999993, 0.0660000000000000031, 0.0524999999999999981, 0.0749999999999999972, 9), |
| (3901, 'M', 0.445000000000000007, 0.344999999999999973, 0.140000000000000013, 0.475999999999999979, 0.205499999999999988, 0.101500000000000007, 0.108499999999999999, 15), |
| (2734, 'I', 0.41499999999999998, 0.33500000000000002, 0.100000000000000006, 0.357999999999999985, 0.169000000000000011, 0.067000000000000004, 0.104999999999999996, 7), |
| (3856, 'M', 0.409999999999999976, 0.33500000000000002, 0.115000000000000005, 0.440500000000000003, 0.190000000000000002, 0.0850000000000000061, 0.135000000000000009, 8), |
| (827, 'I', 0.395000000000000018, 0.28999999999999998, 0.0950000000000000011, 0.303999999999999992, 0.127000000000000002, 0.0840000000000000052, 0.076999999999999999, 6), |
| (3381, 'I', 0.190000000000000002, 0.130000000000000004, 0.0449999999999999983, 0.0264999999999999993, 0.00899999999999999932, 0.0050000000000000001, 0.00899999999999999932, 5), |
| (3972, 'I', 0.400000000000000022, 0.294999999999999984, 0.0950000000000000011, 0.252000000000000002, 0.110500000000000001, 0.0575000000000000025, 0.0660000000000000031, 6), |
| (1155, 'M', 0.599999999999999978, 0.455000000000000016, 0.170000000000000012, 1.1915, 0.695999999999999952, 0.239499999999999991, 0.239999999999999991, 8), |
| (3467, 'M', 0.640000000000000013, 0.5, 0.170000000000000012, 1.4544999999999999, 0.642000000000000015, 0.357499999999999984, 0.353999999999999981, 9), |
| (2433, 'F', 0.609999999999999987, 0.484999999999999987, 0.165000000000000008, 1.08699999999999997, 0.425499999999999989, 0.232000000000000012, 0.380000000000000004, 11), |
| (552, 'I', 0.614999999999999991, 0.489999999999999991, 0.154999999999999999, 0.988500000000000045, 0.41449999999999998, 0.195000000000000007, 0.344999999999999973, 13), |
| (1425, 'F', 0.729999999999999982, 0.57999999999999996, 0.190000000000000002, 1.73750000000000004, 0.678499999999999992, 0.434499999999999997, 0.520000000000000018, 11), |
| (2402, 'F', 0.584999999999999964, 0.41499999999999998, 0.154999999999999999, 0.69850000000000001, 0.299999999999999989, 0.145999999999999991, 0.195000000000000007, 12), |
| (1748, 'F', 0.699999999999999956, 0.535000000000000031, 0.174999999999999989, 1.77299999999999991, 0.680499999999999994, 0.479999999999999982, 0.512000000000000011, 15), |
| (3983, 'I', 0.57999999999999996, 0.434999999999999998, 0.149999999999999994, 0.891499999999999959, 0.362999999999999989, 0.192500000000000004, 0.251500000000000001, 6), |
| (335, 'F', 0.739999999999999991, 0.599999999999999978, 0.195000000000000007, 1.97399999999999998, 0.597999999999999976, 0.408499999999999974, 0.709999999999999964, 16), |
| (1587, 'I', 0.515000000000000013, 0.349999999999999978, 0.104999999999999996, 0.474499999999999977, 0.212999999999999995, 0.122999999999999998, 0.127500000000000002, 10), |
| (2448, 'I', 0.275000000000000022, 0.204999999999999988, 0.0800000000000000017, 0.096000000000000002, 0.0359999999999999973, 0.0184999999999999991, 0.0299999999999999989, 6), |
| (1362, 'F', 0.604999999999999982, 0.474999999999999978, 0.174999999999999989, 1.07600000000000007, 0.463000000000000023, 0.219500000000000001, 0.33500000000000002, 9), |
| (2799, 'M', 0.640000000000000013, 0.484999999999999987, 0.149999999999999994, 1.09800000000000009, 0.519499999999999962, 0.222000000000000003, 0.317500000000000004, 10), |
| (1413, 'F', 0.67000000000000004, 0.505000000000000004, 0.174999999999999989, 1.01449999999999996, 0.4375, 0.271000000000000019, 0.3745, 10), |
| (1739, 'F', 0.67000000000000004, 0.540000000000000036, 0.195000000000000007, 1.61899999999999999, 0.739999999999999991, 0.330500000000000016, 0.465000000000000024, 11), |
| (1152, 'M', 0.584999999999999964, 0.465000000000000024, 0.160000000000000003, 0.955500000000000016, 0.45950000000000002, 0.235999999999999988, 0.265000000000000013, 7), |
| (2427, 'I', 0.564999999999999947, 0.434999999999999998, 0.154999999999999999, 0.782000000000000028, 0.271500000000000019, 0.16800000000000001, 0.284999999999999976, 14), |
| (1777, 'M', 0.484999999999999987, 0.369999999999999996, 0.154999999999999999, 0.967999999999999972, 0.418999999999999984, 0.245499999999999996, 0.236499999999999988, 9), |
| (3294, 'M', 0.574999999999999956, 0.455000000000000016, 0.184999999999999998, 1.15599999999999992, 0.552499999999999991, 0.242999999999999994, 0.294999999999999984, 13), |
| (1403, 'M', 0.650000000000000022, 0.510000000000000009, 0.190000000000000002, 1.54200000000000004, 0.715500000000000025, 0.373499999999999999, 0.375, 9), |
| (2256, 'M', 0.510000000000000009, 0.395000000000000018, 0.14499999999999999, 0.61850000000000005, 0.215999999999999998, 0.138500000000000012, 0.239999999999999991, 12), |
| (3984, 'F', 0.584999999999999964, 0.450000000000000011, 0.125, 0.873999999999999999, 0.354499999999999982, 0.20749999999999999, 0.225000000000000006, 6), |
| (1116, 'M', 0.525000000000000022, 0.405000000000000027, 0.119999999999999996, 0.755499999999999949, 0.3755, 0.155499999999999999, 0.201000000000000012, 9), |
| (1366, 'M', 0.609999999999999987, 0.474999999999999978, 0.170000000000000012, 1.02649999999999997, 0.434999999999999998, 0.233500000000000013, 0.303499999999999992, 10), |
| (3759, 'I', 0.525000000000000022, 0.400000000000000022, 0.140000000000000013, 0.605500000000000038, 0.260500000000000009, 0.107999999999999999, 0.209999999999999992, 9); |
| </pre> |
| |
| -# Train a model with family=gaussian and link=identity |
| <pre class="example"> |
| SELECT madlib.glm( |
| 'abalone', |
| 'abalone_out', |
| 'rings', |
| 'ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]', |
| 'family=gaussian, link=identity'); |
| </pre> |
| |
| |
| <b>Example for Gamma family:</b> (reuse the dataset in Gaussian case) |
| |
| -# Reuse the test data set in Gaussian |
| |
| -# Train a model with family=gamma and link=inverse |
| <pre class="example"> |
| SELECT madlib.glm( |
| 'abalone', |
| 'abalone_out', |
| 'rings', |
| 'ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]', |
| 'family=gamma, link=inverse'); |
| </pre> |
| |
| |
| <b>Example for Inverse Gaussian family:</b> (reuse the dataset in Gaussian case) |
| |
| -# Reuse the test data set in Gaussian |
| |
| -# Train a model with family=inverse_gaussian and link=sqr_inverse |
| <pre class="example"> |
| SELECT madlib.glm( |
| 'abalone', |
| 'abalone_out', |
| 'rings', |
| 'ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]', |
| 'family=inverse_gaussian, link=sqr_inverse'); |
| </pre> |
| |
| |
| <b>Example for Binomial family:</b> (reuse the dataset in Gaussian case) |
| |
| -# Reuse the test data set in Gaussian |
| |
| -# Train a model with family=binomial and link=probit |
| <pre class="example"> |
| SELECT madlib.glm( |
| 'abalone', |
| 'abalone_out', |
| 'rings < 10', |
| 'ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]', |
| 'family=binomial, link=probit'); |
| </pre> |
| |
| -# Predict output probabilities |
| <pre class="example"> |
| SELECT madlib.glm_predict( |
| coef, |
| ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]::float8[], |
| 'probit') |
| FROM abalone_out, abalone; |
| </pre> |
| |
| -# Predict output categories |
| <pre class="example"> |
| SELECT madlib.glm_predict( |
| SELECT madlib.glm_predict_binomial( |
| coef, |
| ARRAY[1, length, diameter, height, whole, shucked, viscera, shell]::float8[], |
| 'probit') |
| FROM abalone_out, abalone; |
| </pre> |
| |
| |
| @anchor notes |
| @par Notes |
| All table names can be optionally schema qualified (current_schemas() would be |
| searched if a schema name is not provided) and all table and column names |
| should follow case-sensitivity and quoting rules per the database. |
| (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'. |
| If mixed-case or multi-byte characters are desired for entity names then the |
| string should be double-quoted; in this case the input would be '"MyTable"'). |
| |
| Currently implementation uses Newton's method and, according to performance tests, |
| when number of features are over 1000, this GLM function could be running slowly. |
| |
| Functions in \ref grp_linreg is prefered to GLM with family=gaussian,link=identity, |
| as the former require only a single pass over the training data. |
| In addition, if user expects to use robust variance, clustered variance, or marginal |
| effects on top of the trained model, functions in \ref grp_linreg and |
| \ref grp_logreg should be used. |
| |
| |
| @anchor related |
| @par Related Topics |
| |
| File glm.sql_in documenting the training function |
| |
| \ref grp_linreg |
| |
| \ref grp_logreg |
| |
| \ref grp_mlogreg |
| |
| \ref grp_robust |
| |
| \ref grp_clustered_errors |
| |
| \ref grp_validation |
| |
| \ref grp_marginal |
| |
| @internal |
| @sa Namespace GLM (documenting the driver/outer loop implemented in |
| Python), Namespace |
| \ref madlib::modules::regress documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| |
| ------------------------------------------------------------------------ |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.__glm_result_type CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.__glm_result_type AS ( |
| coef double precision[], |
| loglik double precision, |
| std_err double precision[], |
| z_stats double precision[], |
| p_values double precision[], |
| num_rows_processed bigint, |
| dispersion double precision |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_merge_states( |
| state1 MADLIB_SCHEMA.bytea8, |
| state2 MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_merge_states' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_final( |
| state MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_final' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_poisson_log_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_poisson_log_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_poisson_log_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_poisson_log_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_poisson_log_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_poisson_identity_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_poisson_identity_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_poisson_identity_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_poisson_identity_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_poisson_identity_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_poisson_sqrt_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_poisson_sqrt_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_poisson_sqrt_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_poisson_sqrt_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_poisson_sqrt_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gaussian_identity_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_gaussian_identity_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gaussian_identity_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_gaussian_identity_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_gaussian_identity_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gaussian_log_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_gaussian_log_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gaussian_log_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_gaussian_log_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_gaussian_log_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gaussian_inverse_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_gaussian_inverse_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gaussian_inverse_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_gaussian_inverse_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_gaussian_inverse_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gamma_log_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_gamma_log_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| ------------------------------------------------------------------------ |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gamma_log_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_gamma_log_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_gamma_log_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gamma_inverse_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_gamma_inverse_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gamma_inverse_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_gamma_inverse_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_gamma_inverse_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_gamma_identity_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_gamma_identity_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_gamma_identity_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_gamma_identity_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_gamma_identity_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_binomial_probit_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_binomial_probit_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_binomial_probit_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_binomial_probit_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_binomial_probit_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_inverse_gaussian_identity_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_inverse_gaussian_identity_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_binomial_logit_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_binomial_logit_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_binomial_logit_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_binomial_logit_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_binomial_logit_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_result_z_stats( |
| /*+ state */ MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.__glm_result_type |
| AS 'MODULE_PATHNAME', 'glm_result_z_stats' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_inverse_gaussian_identity_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_inverse_gaussian_identity_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_inverse_gaussian_identity_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_inverse_gaussian_log_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_inverse_gaussian_log_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_inverse_gaussian_log_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_inverse_gaussian_log_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_inverse_gaussian_log_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_inverse_gaussian_inverse_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_inverse_gaussian_inverse_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_inverse_gaussian_inverse_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_inverse_gaussian_inverse_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_inverse_gaussian_inverse_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_inverse_gaussian_sqr_inverse_transition( |
| MADLIB_SCHEMA.bytea8, |
| double precision, |
| double precision[], |
| MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.bytea8 |
| AS 'MODULE_PATHNAME', 'glm_inverse_gaussian_sqr_inverse_transition' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__glm_inverse_gaussian_sqr_inverse_agg( |
| double precision, double precision[], MADLIB_SCHEMA.bytea8); |
| CREATE AGGREGATE MADLIB_SCHEMA.__glm_inverse_gaussian_sqr_inverse_agg( |
| /*+ y */ double precision, |
| /*+ x */ double precision[], |
| /*+ previous_state */ MADLIB_SCHEMA.bytea8) ( |
| |
| STYPE=MADLIB_SCHEMA.bytea8, |
| SFUNC=MADLIB_SCHEMA.__glm_inverse_gaussian_sqr_inverse_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__glm_merge_states,') |
| FINALFUNC=MADLIB_SCHEMA.__glm_final, |
| INITCOND='' |
| ); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_result_z_stats( |
| /*+ state */ MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.__glm_result_type |
| AS 'MODULE_PATHNAME', 'glm_result_z_stats' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_result_t_stats( |
| /*+ state */ MADLIB_SCHEMA.bytea8) |
| RETURNS MADLIB_SCHEMA.__glm_result_type |
| AS 'MODULE_PATHNAME', 'glm_result_t_stats' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__glm_loglik_diff( |
| /*+ state1 */ MADLIB_SCHEMA.bytea8, |
| /*+ state2 */ MADLIB_SCHEMA.bytea8) |
| RETURNS double precision |
| AS 'MODULE_PATHNAME', 'glm_loglik_diff' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| ------------------------------------------------------------------------ |
| ------------------------------------------------------------------------ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm( |
| source_table varchar, |
| model_table varchar, |
| dependent_varname varchar, |
| independent_varname varchar, |
| family_params varchar, |
| grouping_col varchar, |
| optim_params varchar, |
| verbose boolean |
| ) RETURNS void AS $$ |
| PythonFunction(glm, glm, glm) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- entry functions with default values |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm( |
| source_table varchar, |
| model_table varchar, |
| dependent_varname varchar, |
| independent_varname varchar, |
| family_params varchar, |
| grouping_col varchar, |
| optim_params varchar |
| ) RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.glm($1, $2, $3, $4, $5, $6, $7, FALSE); |
| $$ LANGUAGE sql |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm( |
| source_table varchar, |
| model_table varchar, |
| dependent_varname varchar, |
| independent_varname varchar, |
| family_params varchar, |
| grouping_col varchar |
| ) RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.glm($1, $2, $3, $4, $5, $6, NULL, FALSE); |
| $$ LANGUAGE sql |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm( |
| source_table varchar, |
| model_table varchar, |
| dependent_varname varchar, |
| independent_varname varchar, |
| family_params varchar |
| ) RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.glm($1, $2, $3, $4, $5, NULL, NULL, FALSE); |
| $$ LANGUAGE sql |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- skip the default family before we implement gaussian using this framework |
| |
| |
| -- Help messages ------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(glm, glm, glm_help_msg) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm() |
| RETURNS TEXT |
| AS $$ |
| SELECT MADLIB_SCHEMA.glm(NULL::TEXT); |
| $$ LANGUAGE SQL IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| |
| ------------------------------------------------------------------------- |
| -- Generalized linear model: Prediction --------------------------------------------- |
| /** |
| * @brief Predict the estimated mean value for the response variable given a specific |
| * predictor variable value in a generalized linear model |
| * |
| * @param coef Coefficients obtained by running generalized linear model. |
| * @param col_ind Predictor variable array |
| * @param link Link function used in training. Can be one of probit/logit |
| * @returns Numeric value of the predicted mean |
| * |
| * This function computes the dot product of the independent variables and the |
| * coefficients. This requires the length of the two vectors to be the same. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict( |
| coef DOUBLE PRECISION[], |
| col_ind_var DOUBLE PRECISION[], |
| link TEXT |
| ) RETURNS DOUBLE PRECISION |
| AS 'MODULE_PATHNAME', 'glm_predict' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /** |
| * @brief Predict the output category for the response variable given a specific |
| * predictor variable value in a generalized linear model |
| * |
| * @param coef Coefficients obtained by running generalized linear model. |
| * @param col_ind Predictor variable array |
| * @param link Link function used in training. Can be one of probit/logit. |
| * @returns True/False Boolean value corresponding to output category |
| True if predicted probability >= 0.5, False otherwise |
| * |
| * This function computes the dot product of the independent variables and the |
| * coefficients. This requires the length of the two vectors to be the same. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict_binomial( |
| coef DOUBLE PRECISION[], |
| col_ind_var DOUBLE PRECISION[], |
| link TEXT |
| ) RETURNS BOOLEAN |
| AS 'MODULE_PATHNAME', 'glm_predict_binomial' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /** |
| * @brief Predict the estimated count for the response variable given a specific |
| * predictor variable value in a generalized linear model |
| * |
| * @param coef Coefficients obtained by running generalized linear model. |
| * @param col_ind Predictor variable array |
| * @param link Link function used in training |
| * @returns Numeric value of the predicted count, obtained by rounding the predicted mean |
| * to the nearest integral value. |
| * |
| * This function computes the dot product of the independent variables and the |
| * coefficients. This requires the length of the two vectors to be the same. |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict_poisson( |
| coef DOUBLE PRECISION[], |
| col_ind_var DOUBLE PRECISION[], |
| link TEXT |
| ) RETURNS DOUBLE PRECISION |
| AS 'MODULE_PATHNAME', 'glm_predict_poisson' |
| LANGUAGE C STRICT IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| -- Help messages ------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(glm, glm, glm_predict_help_msg) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict() |
| RETURNS TEXT |
| AS $$ |
| SELECT MADLIB_SCHEMA.glm_predict(''::TEXT); |
| $$ LANGUAGE SQL IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict_poisson( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(glm, glm, glm_predict_poisson_help_msg) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.glm_predict_binomial( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(glm, glm, glm_predict_binomial_help_msg) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| ------------------------------------------------------------- |