| <!-- HTML header for doxygen 1.8.4--> |
| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> |
| <meta http-equiv="X-UA-Compatible" content="IE=9"/> |
| <meta name="generator" content="Doxygen 1.8.4"/> |
| <meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> |
| <title>MADlib: Clustered Variance</title> |
| <link href="tabs.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="jquery.js"></script> |
| <script type="text/javascript" src="dynsections.js"></script> |
| <link href="navtree.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="resize.js"></script> |
| <script type="text/javascript" src="navtree.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(initResizable); |
| $(window).load(resizeHeight); |
| </script> |
| <link href="search/search.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="search/search.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(function() { searchBox.OnSelectItem(0); }); |
| </script> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], |
| jax: ["input/TeX","output/HTML-CSS"], |
| }); |
| </script><script src="../mathjax/MathJax.js"></script> |
| <link href="doxygen.css" rel="stylesheet" type="text/css" /> |
| <link href="madlib_extra.css" rel="stylesheet" type="text/css"/> |
| <!-- google analytics --> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); |
| ga('create', 'UA-45382226-1', 'auto'); |
| ga('send', 'pageview'); |
| </script> |
| </head> |
| <body> |
| <div id="top"><!-- do not remove this div, it is closed by doxygen! --> |
| <div id="titlearea"> |
| <table cellspacing="0" cellpadding="0"> |
| <tbody> |
| <tr style="height: 56px;"> |
| <td style="padding-left: 0.5em;"> |
| <div id="projectname">MADlib |
|  <span id="projectnumber">1.2</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./group__grp__clustered__errors.html"> A newer version is available</a></span> |
| </div> |
| <div id="projectbrief">User Documentation</div> |
| </td> |
| <!--BEGIN VERSIONS LINKS--> |
| <td style="padding-left: 0.5em;"> |
| <div class="versionlist"><ul> |
| <li class="head">More versions:</li> |
| <li><a href="../v1.1/index.html">v1.1</a></li> |
| <li><a href="../v1.0/index.html">v1.0</a></li> |
| <li><a href="../v0.7/index.html">v0.7</a></li> |
| <li><a href="../v0.5/index.html">v0.5</a></li></ul> |
| </div> |
| </td> |
| <td> <div id="MSearchBox" class="MSearchBoxInactive"> |
| <span class="left"> |
| <img id="MSearchSelect" src="search/mag_sel.png" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| alt=""/> |
| <input type="text" id="MSearchField" value="Search" accesskey="S" |
| onfocus="searchBox.OnSearchFieldFocus(true)" |
| onblur="searchBox.OnSearchFieldFocus(false)" |
| onkeyup="searchBox.OnSearchFieldChange(event)"/> |
| </span><span class="right"> |
| <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> |
| </span> |
| </div> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <!-- end header part --> |
| <!-- Generated by Doxygen 1.8.4 --> |
| <script type="text/javascript"> |
| var searchBox = new SearchBox("searchBox", "search",false,'Search'); |
| </script> |
| </div><!-- top --> |
| <div id="side-nav" class="ui-resizable side-nav-resizable"> |
| <div id="nav-tree"> |
| <div id="nav-tree-contents"> |
| <div id="nav-sync" class="sync"></div> |
| </div> |
| </div> |
| <div id="splitbar" style="-moz-user-select:none;" |
| class="ui-resizable-handle"> |
| </div> |
| </div> |
| <script type="text/javascript"> |
| $(document).ready(function(){initNavTree('group__grp__clustered__errors.html','');}); |
| </script> |
| <div id="doc-content"> |
| <!-- window showing the filter options --> |
| <div id="MSearchSelectWindow" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| onkeydown="return searchBox.OnSearchSelectKey(event)"> |
| <a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark"> </span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark"> </span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark"> </span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark"> </span>Variables</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(4)"><span class="SelectionMark"> </span>Groups</a></div> |
| |
| <!-- iframe showing the search results (closed by default) --> |
| <div id="MSearchResultsWindow"> |
| <iframe src="javascript:void(0)" frameborder="0" |
| name="MSearchResults" id="MSearchResults"> |
| </iframe> |
| </div> |
| |
| <div class="header"> |
| <div class="headertitle"> |
| <div class="title">Clustered Variance<div class="ingroups"><a class="el" href="group__grp__glm.html">Generalized Linear Models</a></div></div> </div> |
| </div><!--header--> |
| <div class="contents"> |
| <dl class="section user"><dt>About</dt><dd></dd></dl> |
| <p>Adjusting standard errors for clustering can be important. For example, replicating a dataset 100 times should not increase the precision of parameter estimates. However, computing standard errors under the IID assumption on the replicated data will do exactly that. As another example, in economics of education research it is reasonable to expect that the error terms for children in the same class are not independent. Clustering standard errors can correct for this.</p> |
| <p>Assume that the data can be separated into \(m\) clusters. Usually this can be done by grouping the data table according to one or multiple columns.</p> |
| <p>The estimator has a similar form to the usual sandwich estimator </p> |
| <p class="formulaDsp"> |
| \[ S(\vec{c}) = B(\vec{c}) M(\vec{c}) B(\vec{c}) \] |
| </p> |
| <p>The bread part is the same as in the Huber-White sandwich estimator </p> |
| <p class="formulaDsp"> |
| \begin{eqnarray} B(\vec{c}) &amp; = &amp; \left(-\sum_{i=1}^{n} H(y_i, \vec{x}_i, \vec{c})\right)^{-1}\\ &amp; = &amp; \left(-\sum_{i=1}^{n}\frac{\partial^2 l(y_i, \vec{x}_i, \vec{c})}{\partial c_\alpha \partial c_\beta}\right)^{-1} \end{eqnarray} |
| </p> |
| <p> where \(H\) is the Hessian matrix, i.e. the matrix of second derivatives of the target function </p> |
| <p class="formulaDsp"> |
| \[ L(\vec{c}) = \sum_{i=1}^n l(y_i, \vec{x}_i, \vec{c})\ . \] |
| </p> |
| <p>The meat part is different </p> |
| <p class="formulaDsp"> |
| \[ M(\vec{c}) = \mathbf{A}^T\mathbf{A} \] |
| </p> |
| <p> where the \(m\)-th row of \(\mathbf{A}\) is </p> |
| <p class="formulaDsp"> |
| \[ A_m = \sum_{i\in G_m}\frac{\partial l(y_i,\vec{x}_i,\vec{c})}{\partial \vec{c}} \] |
| </p> |
| <p> where \(G_m\) is the set of rows that belong to the same cluster.</p> |
| <p>We can compute the quantities of \(B\) and \(A\) for each cluster during one scan through the data table in an aggregate function. Then we sum over all clusters to obtain the full \(B\) and \(A\) outside of the aggregate function. Finally, the matrix multiplications are done in a separate function on the master node.</p> |
| <p>When multinomial logistic regression is computed before the multinomial clustered variance calculation, it uses a default reference category of zero and the regression coefficients are included in the output table. The regression coefficients in the output are in the same order as in the multinomial logistic regression function, which is described below. For a problem with \( K \) dependent variables \( (1, ..., K) \) and \( J \) categories \( (0, ..., J-1) \), let \( {m_{k,j}} \) denote the coefficient for dependent variable \( k \) and category \( j \). The output is \( \{m_{k_1, j_0}, m_{k_1, j_1}, \ldots, m_{k_1, j_{J-1}}, m_{k_2, j_0}, m_{k_2, j_1}, \ldots, m_{k_K, j_{J-1}}\} \). The order is NOT CONSISTENT with the multinomial regression marginal effect calculation performed by the function <em>marginal_mlogregr</em>. This is deliberate because the interfaces of all multinomial regressions (robust, clustered, ...) will be moved to match that used in marginal.</p> |
| <dl class="section user"><dt>Usage</dt><dd></dd></dl> |
| <dl class="section warning"><dt>Warning</dt><dd>The <b>'groupingCol'</b> input parameter for all clustered functions is a placeholder, and the <b>'verbose'</b> parameter is a placeholder for <em>clustered_variance_mlogregr</em>. These inputs will be implemented in a future release.</dd></dl> |
| <p><b> The clustered standard errors for linear regression </b></p> |
| <p>For a quick help message, run the following commands for linear regression </p> |
| <pre> |
| select madlib.clustered_variance_linregr(); |
| select madlib.clustered_variance_linregr('help'); |
| select madlib.clustered_variance_linregr('?'); |
| select madlib.clustered_variance_linregr('usage'); |
| </pre><p>For logistic regression, run the following commands to get short help messages inside psql </p> |
| <pre> |
| select madlib.clustered_variance_logregr(); |
| select madlib.clustered_variance_logregr('help'); |
| select madlib.clustered_variance_logregr('?'); |
| select madlib.clustered_variance_logregr('usage'); |
| </pre><p>For multinomial logistic regression, run the following commands to get short help messages inside psql </p> |
| <pre> |
| select madlib.clustered_variance_mlogregr(); |
| select madlib.clustered_variance_mlogregr('help'); |
| select madlib.clustered_variance_mlogregr('?'); |
| select madlib.clustered_variance_mlogregr('usage'); |
| </pre><pre> |
| SELECT madlib.clustered_variance_linregr ( |
| <em>'tbl_data'</em>, -- Data table name |
| <em>'tbl_output'</em>, -- The result table |
| <em>'depvar'</em>, -- An expression used as dependent variable |
| <em>'indvar'</em>, -- An expression used as independent variable |
| <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma |
| <em>'groupingvar'</em> -- The columns used as the grouping variables, separated by comma |
| ); |
| </pre><pre> |
| SELECT madlib.clustered_variance_logregr ( |
| <em>'tbl_data'</em>, -- Data table name |
| <em>'tbl_output'</em>, -- The result table |
| <em>'depvar'</em>, -- An expression used as dependent variable |
| <em>'indvar'</em>, -- An expression used as independent variable |
| <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma |
| <em>'groupingvar'</em>, -- The columns used as the grouping variables, separated by comma |
| <em>max_iter</em>, -- Maximum iteration number for logistic regression, default 20 |
| <em>'optimizer'</em>, -- Optimization method for logistic regression, default 'irls' |
| <em>tolerance</em>, -- When the difference of likelihoods in two consecutive iterations is smaller than |
| -- this value, the computation stops. Default 0.0001 |
| <em>verbose</em> -- Whether to print detailed information when computing logistic regression, |
| -- default is False |
| ); |
| </pre><pre> |
| SELECT madlib.clustered_variance_mlogregr ( |
| <em>'tbl_data'</em>, -- Data table name |
| <em>'tbl_output'</em>, -- The result table |
| <em>'depvar'</em>, -- An expression used as dependent variable |
| <em>'indvar'</em>, -- An expression used as independent variable |
| <em>'clustervar'</em>, -- The columns used as the cluster variables, separated by comma |
| <em>ref_category</em>, -- Reference category in the range of [0, num_category) |
| <em>'groupingvar'</em>, -- The columns used as the grouping variables, separated by comma |
| <em>max_iter</em>, -- Maximum iteration number for logistic regression, default 20 |
| <em>'optimizer'</em>, -- Optimization method for logistic regression, default 'irls' |
| <em>tolerance</em>, -- When the difference of likelihoods in two consecutive iterations is smaller than |
| -- this value, the computation stops. Default 0.0001 |
| <em>verbose</em> -- Whether to print detailed information when computing logistic regression, |
| -- default is False |
| ); |
| </pre><dl class="section user"><dt>Examples</dt><dd></dd></dl> |
| <p>Note that we need to manually include an intercept term in the independent variable expression. The NULL value of <em>groupingvar</em> means that there is no grouping in the calculation.</p> |
| <pre class="fragment">sql> drop table if exists tbl_output; |
| sql> select madlib.clustered_variance_linregr ('abalone', 'tbl_output', 'rings', 'array[1, diameter, length, width]', 'sex', NULL); |
| sql> select * from tbl_output; |
| sql> ---------------------------------------------- |
| sql> drop table if exists tbl_output; |
| sql> select madlib.clustered_variance_logregr ('abalone', 'tbl_output', 'rings &lt; 10', 'array[1, diameter, length, width]', 'sex'); |
| sql> select * from tbl_output; |
| sql> ---------------------------------------------- |
| sql> drop table if exists tbl_output; |
| sql> select madlib.clustered_variance_mlogregr ('abalone', 'tbl_output', 'case when rings &lt; 10 then 1 else 0 end', 'array[1, diameter, length, width]', 'sex', 0); |
| sql> select * from tbl_output; |
| </pre><dl class="section user"><dt>Literature</dt><dd></dd></dl> |
| <p>[1] Standard, Robust, and Clustered Standard Errors Computed in R, <a href="http://diffuseprior.wordpress.com/2012/06/15/standard-robust-and-clustered-standard-errors-computed-in-r/">http://diffuseprior.wordpress.com/2012/06/15/standard-robust-and-clustered-standard-errors-computed-in-r/</a></p> |
| <dl class="section see"><dt>See Also</dt><dd>File <a class="el" href="clustered__variance_8sql__in.html">clustered_variance.sql_in</a> documenting the SQL function </dd></dl> |
| </div><!-- contents --> |
| </div><!-- doc-content --> |
| <!-- start footer part --> |
| <div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> |
| <ul> |
| <li class="footer">Generated on Thu Jan 9 2014 20:35:40 for MADlib by |
| <a href="http://www.doxygen.org/index.html"> |
| <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.4 </li> |
| </ul> |
| </div> |
| </body> |
| </html> |