% doc/design/modules/sandwich-estimators.tex - madlib - Git at Google

 % When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
 % !TEX root = ../design.tex

 \chapter[Sandwich Estimators]{Sandwich Estimators}


 \section{Introduction}
 Given a regression setup of $n$ data points, each defined by a feature vector $x_i$ and a category $y_i$, we assume that $y_i$ is controlled by a $k$-dimensional parameter vector $\theta$.  Generally, we are interested in finding the values of $\theta$ that best predict $y_i$ from $x_i$, with \textit{best} being defined as the values that maximize some likelihood function $L(y,x,\theta)$.  The maximization is typically solved using the derivative $\psi$ of the likelihood and the Hessian $H$.  More formally, writing $l(y_i,x_i,\theta)$ for the contribution of the $i$-th data point to $L$, $\psi$ is defined as
 \begin{align}
   \psi(y_i,x_i, \theta) = \frac{\partial l(y_i,x_i,\theta)}{\partial \theta}
 \end{align}
 and $H$ is defined as
 \begin{align}
 H(y,x, \theta) = \frac{\partial^2 L(y,x,\theta)}{\partial \theta^2}.
 \end{align}


 In addition to the values of $\theta$, we may also be interested in the covariance matrix $S(\theta)$ of $\theta$.  This can be expressed in a \textit{sandwich formulation}, of the form
 \begin{align}
 S(\theta) = B(\theta) M(\theta) B(\theta).
 \end{align}
 The $B(\theta)$ matrix is commonly called the \textit{bread}, whereas the $M(\theta)$ matrix is the \textit{meat}.

 \subsection{The Bread}
 Computing $B$ is relatively straightforward:
 \begin{align}
 B(\theta) = n\left(\sum_{i=1}^n -H(y_i, x_i, \theta) \right)^{-1}.
 \end{align}

 \subsection{The Meat}
 There are several choices for the $M$ matrix, each with different robustness properties.  The estimators we are interested in for this implementation are the Huber/White estimator and the clustered estimator.

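 In the Huber/White estimator, with $\psi$ treated as a $1 \times k$ row vector so that each summand is a $k \times k$ matrix, the matrix $M$ is defined as
 \begin{align}
 M_{H} = \sum_{i=1}^n \psi(y_i,x_i, \theta)^T  \psi(y_i,x_i, \theta).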
 \end{align}

 %The Huber/White estimator is a diagonal matrix defined as
 %\begin{align}
 %M_W =\frac{1}{n} X^T \left(
 %      \begin{array}{cccc}
 %        r(y_1, x_1^T\theta) & 0&\ldots &0 \\
 %        0&r(y_2, x_2^T\theta) &\dots& \vdots \\
 %	\vdots & \vdots & \ddots&\vdots \\
 %        0 & \ldots & \ldots &r(y_n, x_n^T\theta)
 %      \end{array} \right) X.
 %\end{align}
 %The matrix $X$ is the data matrix, and $r$ is the residual function.  The residual function is defined by the application, but it has the relationship
 %\begin{align}
 %  \psi(y_i,x_i, \theta) = r(y_i, x_i^T\theta)x_i.
 %\end{align}
 %This means that it can be computed in the general case as
 %\begin{align}
 % r(y_i, x_i^T\theta) =  \frac{\psi(y_i,x_i, \theta)}{x_i}
 %\end{align}
 %
 %In the case of the clustered sandwich estimator, $M$ is defined as


 %In addition to the computational expense of computing the Hessian, the inverse Hessian must be computed as well.  Unfortunately, matrix inversion is an expensive operation, and  cannot be parallelized.
	% When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
	% !TEX root = ../design.tex

	\chapter[Sandwich Estimators]{Sandwich Estimators}


	\section{Introduction}
	Given a regression setup of $n$ data points, each defined by a feature vector $x_i$ and a category $y_i$, we assume that $y_i$ is controlled by a $k$-dimensional parameter vector $\theta$. Generally, we are interested in finding the values of $\theta$ that best predict $y_i$ from $x_i$, with \textit{best} being defined as the values that maximize some likelihood function $L(y,x,\theta)$. The maximization is typically solved using the derivative $\psi$ of the likelihood and the Hessian $H$. More formally, writing $l(y_i,x_i,\theta)$ for the contribution of the $i$-th data point to $L$, $\psi$ is defined as
	\begin{align}
	\psi(y_i,x_i, \theta) = \frac{\partial l(y_i,x_i,\theta)}{\partial \theta}
	\end{align}
	and $H$ is defined as
	\begin{align}
	H(y,x, \theta) = \frac{\partial^2 L(y,x,\theta)}{\partial \theta^2}.
	\end{align}



	In addition to the values of $\theta$, we may also be interested in the covariance matrix $S(\theta)$ of $\theta$. This can be expressed in a \textit{sandwich formulation}, of the form
	\begin{align}
	S(\theta) = B(\theta) M(\theta) B(\theta).
	\end{align}
	The $B(\theta)$ matrix is commonly called the \textit{bread}, whereas the $M(\theta)$ matrix is the \textit{meat}.

	\subsection{The Bread}
	Computing $B$ is relatively straightforward:
	\begin{align}
	B(\theta) = n\left(\sum_{i=1}^n -H(y_i, x_i, \theta) \right)^{-1}.
	\end{align}

	\subsection{The Meat}
	There are several choices for the $M$ matrix, each with different robustness properties. The estimators we are interested in for this implementation are the Huber/White estimator and the clustered estimator.

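	In the Huber/White estimator, with $\psi$ treated as a $1 \times k$ row vector so that each summand is a $k \times k$ matrix, the matrix $M$ is defined as
	\begin{align}
	M_{H} = \sum_{i=1}^n \psi(y_i,x_i, \theta)^T \psi(y_i,x_i, \theta).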
	\end{align}

	%The Huber/White estimator is a diagonal matrix defined as
	%\begin{align}
	%M_W =\frac{1}{n} X^T \left(
	% \begin{array}{cccc}
	% r(y_1, x_1^T\theta) & 0&\ldots &0 \\
	% 0&r(y_2, x_2^T\theta) &\dots& \vdots \\
	% \vdots & \vdots & \ddots&\vdots \\
	% 0 & \ldots & \ldots &r(y_n, x_n^T\theta)
	% \end{array} \right) X.
	%\end{align}
	%The matrix $X$ is the data matrix, and $r$ is the residual function. The residual function is defined by the application, but it has the relationship
	%\begin{align}
	% \psi(y_i,x_i, \theta) = r(y_i, x_i^T\theta)x_i.
	%\end{align}
	%This means that it can be computed in the general case as
	%\begin{align}
	% r(y_i, x_i^T\theta) = \frac{\psi(y_i,x_i, \theta)}{x_i}
	%\end{align}
	%
	%In the case of the clustered sandwich estimator, $M$ is defined as




	%In addition to the computational expense of computing the Hessian, the inverse Hessian must be computed as well. Unfortunately, matrix inversion is an expensive operation, and cannot be parallelized.