blob: fe0b878b6a07e1d0fb6d134e08f748f44b1bfdb8 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.4"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Marginal Effects</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
$(window).load(resizeHeight);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script src="../mathjax/MathJax.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'auto');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td style="padding-left: 0.5em;">
<div id="projectname">MADlib
&#160;<span id="projectnumber">1.2</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./group__grp__marginal.html"> A newer version is available</a></span>
</div>
<div id="projectbrief">User Documentation</div>
</td>
<!--BEGIN VERSIONS LINKS-->
<td style="padding-left: 0.5em;">
<div class="versionlist"><ul>
<li class="head">More versions:</li>
<li><a href="../v1.1/index.html">v1.1</a></li>
<li><a href="../v1.0/index.html">v1.0</a></li>
<li><a href="../v0.7/index.html">v0.7</a></li>
<li><a href="../v0.5/index.html">v0.5</a></li></ul>
</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.4 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__marginal.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark">&#160;</span>Variables</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(4)"><span class="SelectionMark">&#160;</span>Groups</a></div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Marginal Effects<div class="ingroups"><a class="el" href="group__grp__glm.html">Generalized Linear Models</a></div></div> </div>
</div><!--header-->
<div class="contents">
<dl class="section user"><dt>About</dt><dd></dd></dl>
<p>A marginal effect (ME) or partial effect measures the effect on the conditional mean of \( y \) of a change in one of the regressors, say \(X_k\). In the linear regression model, the ME equals the relevant slope coefficient, greatly simplifying analysis. For nonlinear models, we require specialized algorithms for calculating ME.</p>
<p>The standard approach to modeling dichotomous/binary variables (so \(y \in \{0, 1\} \)) is to estimate a generalized linear model under the assumption that \( y \) follows some form of Bernoulli distribution. Thus the expected value of \( y \) becomes, </p>
<p class="formulaDsp">
\[ y = G(X&#39; \beta), \]
</p>
<p>where G is the specified binomial distribution. For logistic regression, the function \( G \) represents the inverse logit function.</p>
<p>In logistic regression: </p>
<p class="formulaDsp">
\[ P = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + \dots + \beta_j x_j)}} = \frac{1}{1 + e^{-z}} \implies \frac{\partial P}{\partial X_k} = \beta_k \cdot \frac{1}{1 + e^{-z}} \cdot \frac{e^{-z}}{1 + e^{-z}} \\ = \beta_k \cdot P \cdot (1-P) \]
</p>
<p>There are several methods for calculating the marginal effects for dichotomous dependent variables. This package uses the average of the marginal effects at every sample observation.</p>
<p>This is calculated as follows: </p>
<p class="formulaDsp">
\[ \frac{\partial y}{\partial x_k} = \beta_k \frac{\sum_{i=1}^n P(y_i = 1)(1-P(y_i = 1))}{n}, \\ \text{where } P(y_i=1) = G(X^{(i)}\beta) \]
</p>
<p>We use the delta method for calculating standard errors on the marginal effects.</p>
<dl class="section user"><dt>Input</dt><dd></dd></dl>
<p>The input parameters are expected to be of the following form: </p>
<pre> <em>margins_logregr</em> (
<em>sourceTable</em> VARCHAR,
<em>outputTable</em> VARCHAR,
<em>dependentVariable</em> VARCHAR,
<em>independentVariable</em> VARCHAR,
[<em>groupingCol</em> VARCHAR,
<em>marginal_vars</em> INTEGER[],
<em>max_iter</em> INTEGER,
<em>optimizer</em> VARCHAR,
<em>tolerance</em> DOUBLE PRECISION
]
)</pre><p>For multinomial logistic regression the input parameters are expected to be of the following form: </p>
<pre> <em>margins_mlogregr</em> (
<em>sourceTable</em> VARCHAR,
<em>outputTable</em> VARCHAR,
<em>dependentVariable</em> VARCHAR,
<em>independentVariable</em> VARCHAR,
[<em>groupingCol</em> VARCHAR,
<em>referenceCategory</em> INTEGER,
<em>marginal_vars</em> INTEGER[],
<em>max_iter</em> INTEGER,
<em>optimizer</em> VARCHAR,
<em>tolerance</em> DOUBLE PRECISION
]
)</pre><dl class="section warning"><dt>Warning</dt><dd>The <b>'groupingCol'</b> input parameter for <em>margins_logregr</em> and <em>margins_mlogregr</em> is a placeholder in the Madlib V1.0. These inputs will be implemented in a future release. </dd></dl>
<dl class="section user"><dt>Usage</dt><dd></dd></dl>
<p><b> The Interface</b></p>
<p>For logistic regression, one can call the following function </p>
<pre>
SELECT madlib.<a class="el" href="marginal_8sql__in.html#ad37e0349ac3924daba27c4600311ed71">margins_logregr</a>(
<em>'source_table'</em>, -- name of input table, VARCHAR
<em>'out_table'</em>, -- name of output table, VARCHAR
<em>'dependent_varname'</em>, -- dependent variable, VARCHAR
<em>'independent_varname'</em>, -- independent variable, VARCHAR
[ <em>'grouping_cols'</em>, -- comma separated list of grouping vars, VARCHAR (Default NULL)
<em>'marginal_effect_variables'</em>, -- Index list (base 1) with positions representing which marginal variable to calculate, INTEGER[] (Default NULL)
<em> max_iter</em>, -- Maximum number of iterations to run the logistic regression INTEGER (Default 20)
<em>'optimizer'</em>, -- Optimizer used for logistic regression VARCHAR (Default &amp; recommended 'irls')
<em>tolerance</em>, -- Tolerance for the logistic regression DOUBLE PRECISION (default 1e-4)
]
);
</pre><p>For multinomial logistic regression, one can call the following function </p>
<pre>
SELECT madlib.margins_mlogregr(
<em>'source_table'</em>, -- name of input table, VARCHAR
<em>'out_table'</em>, -- name of output table, VARCHAR
<em>'dependent_varname'</em>, -- dependent variable, VARCHAR
<em>'independent_varname'</em>, -- independent variable, VARCHAR
[ <em>'reference_category'</em>, -- Reference category for multinomial logistic regression INTEGER
<em>'grouping_cols'</em>, -- comma separated list of grouping vars, VARCHAR (Default NULL)
<em>'marginal_effect_variables'</em>, -- Index list (base 1) with positions representing which marginal variable to calculate, INTEGER[] (Default NULL)
<em> max_iter</em>, -- Maximum number of iterations to run the logistic regression INTEGER (Default 20)
<em>'optimizer'</em>, -- Optimizer used for logistic regression VARCHAR (Default &amp; recommended 'irls')
<em>tolerance</em>, -- Tolerance for the logistic regression DOUBLE PRECISION (default 1e-4)
]
);
</pre><p>Output is stored in the <em>out_table</em>: </p>
<pre>
[ margins | std_err | t_stats | p_values |
+------+---------+-------+----------+
</pre><p><b> Multinomial-Logistic Regression Notes</b></p>
<ul>
<li>The reference category ranges from [0, numCategories-1]. The default reference category is zero.</li>
<li>The marginal effects and supporting statistics are output in the following order: for a problem with K independent variables (1, ..., K) and J categories (0, ..., J-1), the output is \( \{ m_{k_1, j_0} \ldots m_{k_K, j_{0}}, m_{k_1, j_1} \ldots m_{k_K, j_{1}}, \ldots, m_{k_1, j_{J-1}} \ldots m_{k_K, j_{J-1}} \} \). The order is NOT CONSISTENT with mlogregr, robust_variance_mlogregr and clustered_variance_mlogregr. This is deliberate because the interfaces of all the other functions will be moved to match that used in marginal.</li>
<li>Selectively picking the marginal effects of the independent variables will return the marginal effects of ALL categories associated with the independent variable.</li>
<li>Here the <em>'independent_varname'</em> can be the name of a column, which contains array of numeric values. It can also have a format of string 'array[1, x1, x2, x3]', where <em>x1</em>, <em>x2</em> and <em>x3</em> are all column names.</li>
<li>Here the <em>'vars'</em> is an index list (base 1) with the set of indices amongst the independent variables that must be selected. One can also choose <em>NULL</em> to pick all independent variables.</li>
</ul>
<dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Create the sample data set: <pre class="fragment">sql&gt; SELECT * FROM data;
id | second_attack | treatment | trait_anxiety
----+---------------+-----------+---------------
1 | 1 | 1 | 70
3 | 1 | 1 | 50
5 | 1 | 0 | 40
7 | 1 | 0 | 75
9 | 1 | 0 | 70
11 | 0 | 1 | 65
13 | 0 | 1 | 45
15 | 0 | 1 | 40
17 | 0 | 0 | 55
...
</pre></li>
<li>For function summary information, run: <pre class="fragment">sql&gt; select margins_logregr('help');
OR
sql&gt; select margins_logregr();
OR
sql&gt; select margins_logregr('');
</pre></li>
<li>Run the logistic regression function and then compute the marginal effects of all variables in the regression: <pre class="fragment">sql&gt; select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]');
sql&gt; select * from result_table;
margins | {-0.970665392796,-0.156214190168,0.0181587690137}
coef | {-6.36346994178179,-1.02410605239327,0.119044916668605}
std_err | {0.802871454422,0.292691682191,0.0137459874022}
t_stats | {-1.2089922832,-0.533715850748,1.32102325446}
p_values | {0.243212810329,0.600447858606,0.204000202116}
-# Alternate Syntax: Run the logistic regression function and then compute the marginal effects of all variables in the regression:
sql&gt; select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, NULL);
sql&gt; select * from result_table;
margins | {-0.970665392796,-0.156214190168,0.0181587690137}
coef | {-6.36346994178179,-1.02410605239327,0.119044916668605}
std_err | {0.802871454422,0.292691682191,0.0137459874022}
t_stats | {-1.2089922832,-0.533715850748,1.32102325446}
p_values | {0.243212810329,0.600447858606,0.204000202116}
-# Run the logistic regression function and then compute the marginal effects of the first variable in the regression
sql&gt; select margins_logregr('patients', 'result_table', 'second_attack', 'ARRAY[1, treatment, trait_anxiety]', NULL, ARRAY[1]);
sql&gt; select * from result_table;
margins | {-0.970665392796}
coef | {-6.36346994178179}
std_err | {0.802871454422}
t_stats | {-1.2089922832}
p_values | {0.243212810329}
</pre></li>
</ol>
</dd></dl>
<dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Create the sample data set (the full dataset has 3 categories): <pre class="fragment">sql&gt; SELECT * FROM data;
id | feature_1 | feature_2 | category
----+---------------+-----------+---------------
1 | 1 | 35 | 1
3 | 2 | 33 | 0
5 | 3 | 39 | 1
7 | 1 | 37 | 1
9 | 2 | 31 | 1
11 | 3 | 36 | 1
13 | 2 | 36 | 1
15 | 2 | 36 | 0
17 | 2 | 31 | 5
...
</pre></li>
<li>For function summary information, run: <pre class="fragment">sql&gt; select margins_mlogregr('help');
OR
sql&gt; select margins_mlogregr();
OR
sql&gt; select margins_mlogregr('');
</pre></li>
<li>Run the regression function and then compute the marginal effects of all variables in the regression (see docs for detailed order) <pre class="fragment">sql&gt; select margins_mlogregr('test_data', 'result_table', 'category', 'ARRAY[1, feature_1, feature_2]');
sql&gt; select * from result_table;
margins | {0.741613239156,-0.032868883552,-0.0144502990691,-0.972055011831,0.112337273885,0.0172621628253}
std_err | {0.183172236055,0.044184899499,0.00332608999704,0.263532615748,0.0555196094594,0.00457999429836}
t_stats | {4.04872078394,-0.743894043547,-4.34453038911,-3.68855676202,2.02338011702,3.76903587663}
p_values | {7.43784735554e-05,0.457840607871,2.24855476205e-05,0.000292799037776,0.0444060346517,0.000217384008015}
</pre></li>
<li>Run the regression and compute the marginal effects for the first independent variable (all categories) <pre class="fragment">sql&gt; select margins_mlogregr('test_data', 'result_table', 'category', 'ARRAY[1, feature_1, feature_2]', 0, NULL, ARRAY[1]);
sql&gt; select * from result_table;
margins | {0.741613239156,-0.972055011831}
std_err | {0.183172236055,0.263532615748}
t_stats | {4.04872078394,-3.68855676202}
p_values | {7.43784735554e-05,0.000292799037776}
</pre></li>
</ol>
</dd></dl>
<pre class="fragment">-# For function usage information.
sql&gt; select margins_mlogregr('usage');
</pre><dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p>[1] mfx function in STATA: <a href="http://www.stata.com/help.cgi?mfx_option">http://www.stata.com/help.cgi?mfx_option</a></p>
<dl class="section see"><dt>See Also</dt><dd>File <a class="el" href="marginal_8sql__in.html" title="SQL functions for linear regression. ">marginal.sql_in</a> documenting the SQL functions. </dd></dl>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Thu Jan 9 2014 20:35:40 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.4 </li>
</ul>
</div>
</body>
</html>