% When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
% !TEX root = ../design.tex
\chapter[Sandwich Estimators]{Sandwich Estimators}
\section{Introduction}
Given a regression setup of $n$ data points, each consisting of a feature vector $x_i$ and a response $y_i$, we assume that $y_i$ is governed by a $k$-dimensional parameter vector $\theta$. Generally, we are interested in finding the values of $\theta$ that best predict $y_i$ from $x_i$, with \textit{best} defined as the values that maximize a likelihood function $L(y,x,\theta)$. The maximization is typically carried out using the score function $\psi$, the derivative of the per-observation log-likelihood $l$, and its Hessian $H$. More formally, $\psi$ is defined as
\begin{align}
\psi(y_i, x_i, \theta) = \frac{\partial l(y_i, x_i, \theta)}{\partial \theta}
\end{align}
and $H$ is defined as
\begin{align}
H(y_i, x_i, \theta) = \frac{\partial^2 l(y_i, x_i, \theta)}{\partial \theta^2}.
\end{align}
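To make these quantities concrete, the following is a minimal numpy sketch for a Gaussian linear model with unit variance, where $l(y_i, x_i, \theta) = -\tfrac{1}{2}(y_i - x_i^T\theta)^2$ up to a constant; the function names \texttt{psi\_i} and \texttt{hess\_i} are chosen for this example only.
\begin{verbatim}
import numpy as np

def psi_i(y_i, x_i, theta):
    # Score: gradient of l = -0.5 * (y_i - x_i @ theta)**2,
    # which works out to (y_i - x_i @ theta) * x_i.
    return (y_i - x_i @ theta) * x_i          # shape (k,)

def hess_i(y_i, x_i, theta):
    # Hessian of l with respect to theta: -x_i x_i^T.
    return -np.outer(x_i, x_i)                # shape (k, k)
\end{verbatim}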
In addition to the values of $\theta$, we may also be interested in the covariance matrix $S(\theta)$ of the estimated $\theta$. This can be expressed in a \textit{sandwich formulation} of the form
\begin{align}
S(\theta) = B(\theta) M(\theta) B(\theta).
\end{align}
The $B(\theta)$ matrix is commonly called the \textit{bread}, whereas the $M(\theta)$ matrix is the \textit{meat}.
\subsection{The Bread}
Computing $B$ is relatively straightforward,
\begin{align}
B(\theta) = n\left(\sum_{i=1}^n -H(y_i, x_i, \theta) \right)^{-1}.
\end{align}
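As an illustration, a direct numpy translation of this formula is sketched below; it takes the precomputed per-observation Hessians as input rather than recomputing them, and the name \texttt{bread} is ours.
\begin{verbatim}
import numpy as np

def bread(hessians):
    # B(theta) = n * (sum_i -H(y_i, x_i, theta))^{-1},
    # where hessians is a length-n list of k x k arrays.
    n = len(hessians)
    A = -np.sum(hessians, axis=0)
    return n * np.linalg.inv(A)
\end{verbatim}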
\subsection{The Meat}
There are several choices for the $M$ matrix, each with different robustness properties. The estimators we are interested in for this implementation are the Huber/White estimator and the clustered estimator.
In the Huber/White estimator, the matrix $M$ is defined as
\begin{align}
M_{H} = \sum_{i=1}^n \psi(y_i, x_i, \theta)^T \psi(y_i, x_i, \theta).
\end{align}
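Treating each $\psi(y_i, x_i, \theta)$ as a row vector, every term in the sum is a $k \times k$ outer product. Stacking the $n$ scores into an $n \times k$ matrix collapses the sum to a single matrix product, as in the following sketch (again with illustrative names):
\begin{verbatim}
import numpy as np

def meat_huber_white(scores):
    # scores: n x k array whose i-th row is psi(y_i, x_i, theta).
    # M_H = sum_i psi_i^T psi_i = scores^T scores.
    return scores.T @ scores

def sandwich(B, M):
    # S(theta) = B M B.
    return B @ M @ B
\end{verbatim}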
In the case of the clustered sandwich estimator, the observations are partitioned into $G$ disjoint clusters (for example, repeated measurements of the same subject), and the standard construction defines $M$ as
\begin{align}
M_{CL} = \sum_{c=1}^{G} \left( \sum_{i \in c} \psi(y_i, x_i, \theta) \right)^T \left( \sum_{i \in c} \psi(y_i, x_i, \theta) \right).
\end{align}
The scores are summed within each cluster before the outer product is taken, so arbitrary correlation among observations in the same cluster is accounted for. When every cluster contains a single observation, $M_{CL}$ reduces to $M_H$.
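Under the same conventions as above, a sketch of the clustered meat aggregates the scores per cluster before forming the outer products (\texttt{meat\_clustered} is an illustrative name).
\begin{verbatim}
import numpy as np

def meat_clustered(scores, cluster_ids):
    # scores: n x k array of per-observation scores psi(y_i, x_i, theta);
    # cluster_ids: length-n array of cluster labels.
    k = scores.shape[1]
    M = np.zeros((k, k))
    for c in np.unique(cluster_ids):
        g = scores[cluster_ids == c].sum(axis=0)  # summed score, cluster c
        M += np.outer(g, g)
    return M
\end{verbatim}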
In addition to the computational expense of computing the Hessian, its inverse must be computed as well to form the bread. Unfortunately, matrix inversion is an expensive operation and, unlike the per-observation sums, does not parallelize easily.