% When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
%!TEX root = ../design.tex
\chapter[Regression]{Regression}
\label{chap:regression}
\begin{moduleinfo}
\item[Authors] {Rahul Iyer and Hai Qian}
\item[History]
\begin{modulehistory}
\item[v0.3] Added section on Clustered Sandwich Estimators
\item[v0.2] Added section on Marginal Effects
\item[v0.1] Initial version, including background of regularization
\end{modulehistory}
\end{moduleinfo}
\newcommand{\bS}[1]{\boldsymbol{#1}}
% Abstract. What is the problem we want to solve?
Regression analysis is a statistical tool for the investigation of
relationships between variables. Usually, the investigator seeks to ascertain
the causal effect of one variable upon another - the effect of a price increase
upon demand, for example, or the effect of changes in the money supply upon the
inflation rate. More specifically, regression analysis helps one understand how
the typical value of the dependent variable changes when any one of the
independent variables is varied, while the other independent variables are held
fixed.
Regression models involve the following variables:
\begin{enumerate}
\item The unknown parameters, denoted as $\beta$, which may represent a scalar or a vector.
\item The independent variables, $x$
\item The dependent variables, $y$
\end{enumerate}
%\section{Linear Methods for Regression} % (fold)
%\label{sub:linear_methods_for_regression}
% subsection linear_methods_for_regression (end)
\section{Multinomial Logistic Regression}\label{sec:multi_intro}
Multinomial logistic regression is a widely used regression analysis tool that
models the outcomes of categorical dependent random variables.
\textit{Generalized linear models} identify key ideas shared by a broad class of
distributions, thereby extending techniques used in linear regression into the
field of logistic regression.
This document provides an overview of the theory of multinomial logistic
regression models followed by a design specification of how the model can be
estimated using maximum likelihood estimators. In the final section, we outline
a specific implementation of the algorithm that estimates multinomial logistic
regression models on large datasets.
\subsection{Problem Description}\label{sec:multi_problem}
In this section, we set up the notation for a generic multinomial regression
problem. Let us consider an $N$-dimensional multinomial random variable $\bS{Z}$
that can take values in $J$ different categories, where $J \geq 2$. As input to
the problem, we are given an $N\times J$ matrix of observations $\bS{y}$. Here
$y_{i,j}$ denotes the observed value for the $j^{th}$ category of the random
variable $Z_i$. Analogous to the observed values, we define a set of parameters
$\bS{\pi}$ as an $N\times J$ matrix with each entry $\pi_{i,j}$ denoting the
probability of observing the random variable $Z_i$ to fall in the $j^{th}$
category. In logistic regression, we assume that the random variables $\bS{Z}$
are explained by a \textit{design matrix} of independent random variables $\bS{X}$
which contains $N$ rows and $(K+1)$ columns. We define a regression coefficient
matrix $\bS{\beta}$ with $K+1$ rows and $J$ columns such that $\beta_{k,j}$
corresponds to the weight of the $k^{th}$ explanatory variable in predicting the
$j^{th}$ category.
For the multinomial regression model, we assume the observations $\bS{y}$ are
realizations of random variables $\bS{Z}$ which are explained using random
variables $\bS{X}$ and parameters $\bS{\beta}$. More specifically, if we
consider the $J^{th}$ category to be the `pivot' or `baseline' category, then
the log of the odds of each other category relative to the $J^{th}$ category can
be predicted using a linear function of variables $\bS{X}$ and parameters
$\bS{\beta}$.
\begin{equation}
\log \Big( \frac{\pi_{i,j}}{\pi_{i,J}}\Big)
	= \log \Big( \frac{\pi_{i,j}}{1 - \sum_{j'=1}^{J-1}\pi_{i,j'}}\Big)
	= \sum_{k=0}^{K} x_{i,k}\beta_{k,j}
% \ \ \ \ \ \aI, \aJ
\end{equation}
Solving for $\pi_{i,j}$, we have
\begin{subequations}
\label{eq:pi_def}
\begin{align}
\pi_{i,j} &= \frac{\exp \big( {\sum_{k=0}^{K} x_{i,k}\beta_{k,j}}\big)}{1 + \sum_{j=1}^{J-1}\exp{\big( \sum_{k=0}^{K} x_{i,k}\beta_{k,j}\big)}}\ \ \ \ \forall j < J\\
\pi_{i,J} &= \frac{1}{1 + \sum_{j=1}^{J-1}\exp{\big( \sum_{k=0}^{K} x_{i,k}\beta_{k,j}\big)}}
\end{align}
\end{subequations}
In a sentence, the goal of multinomial regression is to use observations
$\bS{y}$ to estimate parameters $\bS{\beta}$ that can predict random variables
$\bS{Z}$ using explanatory variables $\bS{X}$.
\subsection{Parameter Estimation}
We evaluate the parameters $\bS{\beta}$ using a maximum-likelihood estimator
(MLE) which maximizes the likelihood that a certain set of parameters predict
the given set of observations. For this, we define the following likelihood
function:
\begin{align}
 L(\bS{\beta}|\bS{y}) &\simeq \prod_{i=1}^{N}\prod_{j=1}^{J}{\pi_{i,j}}^{y_{i,j}} \\
 \intertext{Substituting \eqref{eq:pi_def} in the above expression, we have}
	 &= \prod_{i=1}^{N} \frac{\prod_{j=1}^{J-1} e^{y_{i,j} \sum_{k=0}^{K} x_{i,k}\beta_{k,j}}}{1 + \sum_{j=1}^{J-1} e^{\sum_{k=0}^{K} x_{i,k}\beta_{k,j}}} \\
 \intertext{Taking the natural logarithm of $L(\bS{\beta}|\bS{y})$ defined above, we obtain the following expression for the log-likelihood function, $l(\bS{\beta})$:}
 l(\bS{\beta}) &= \sum_{i=1}^{N} \Bigg[ \sum_{j=1}^{J-1} y_{i,j} \sum_{k=0}^{K} x_{i,k}\beta_{k,j} - \log \Big( 1 + \sum_{j=1}^{J-1} e^{\sum_{k=0}^{K} x_{i,k}\beta_{k,j}}\Big) \Bigg] \label{eq:log_lik}
\end{align}
The maximum likelihood estimator tries to maximize the log-likelihood function as
defined in Equation \eqref{eq:log_lik}. Unlike linear regression, the MLE has to
be obtained numerically. Since we plan to implement derivative based algorithms
to solve $\max_{\bS{\beta}} l(\bS{\beta})$, we first derive expressions for the
first and second derivatives of the log-likelihood function.
We differentiate \eqref{eq:log_lik} with respect to each parameter $\beta_{k,j}$
\begin{equation}\label{eq:first_derivative}
\frac{\partial l(\bS{\beta})}{\partial \beta_{k,j}} = \sum_{i=1}^{N} \big( y_{i,j} - \pi_{i,j} \big) x_{i,k} \ \ \ \ \forall k \ \forall j
\end{equation}
We evaluate the extreme point of the function $l(\bS{\beta})$ by setting each
equation of \eqref{eq:first_derivative} to zero. We proceed along similar lines to
derive the second-order derivative of $l(\bS{\beta})$ with respect to two
parameters $\beta_{k_1,j_1}$ and $\beta_{k_2,j_2}$:
\begin{subequations}
\begin{align}\label{eq:second_derivative}
\frac{\partial^2 l(\bS{\beta})}{\partial \beta_{k_2,j_2} \partial \beta_{k_1,j_1}}
&= \sum_{i=1}^{N} -\pi_{i,j_2}x_{i,k_2}(1-\pi_{i,j_1})x_{i,k_1} &&j_1 = j_2 \\
&= \sum_{i=1}^{N} \pi_{i,j_2}x_{i,k_2}\pi_{i,j_1}x_{i,k_1} &&j_1 \neq j_2
\end{align}
\end{subequations}
\subsection{Algorithms}
Newton's method is a numerical recipe for finding roots of nonlinear functions.
We apply this method to solve the nonlinear equations produced by setting
\eqref{eq:first_derivative} to zero. Newton's method begins with an initial
guess for the solution, after which it uses the Taylor series approximation of
the function at the current iterate to produce another estimate that may be
closer to the true solution. This iterative procedure has one of the fastest
theoretical rates of convergence in terms of the number of iterations, but it
requires second-derivative information to be computed at each iteration, which
is considerably more work than light-weight first-order methods require.
Newton's method can be compactly described by its update step. Let
$\bS{\beta^0}$ be the initial guess for the MLE. If $\bS{\beta^I}$ is the
estimate of the MLE at the $I^{th}$ iteration, then we can evaluate
$\bS{\beta^{I+1}}$ using
\begin{align}\label{eq:update}
\bS{\beta^{I+1}} &= \bS{\beta^{I}} - [l''(\bS{\beta^{I}})]^{-1} \ l'(\bS{\beta^{I}}) \\
&= \bS{\beta^{I}} - [l''(\bS{\beta^{I}})]^{-1} \bS{X}^{T} (\bS{y} - \bS{\pi})
\end{align}
where $\bS{X}^{T} (\bS{y} - \bS{\pi})$ is matrix notation for the first-order
derivative. Newton's method has a proven advantage in terms of the number of
iterations on small problems, but it may not scale well to larger problems
because each iteration requires an expensive inversion of the second-derivative
matrix.
As an aside, we observe that Equations \eqref{eq:first_derivative} and
\eqref{eq:second_derivative} refer to the first derivative in matrix form and
the second derivative in tensor form, respectively. In the implementation phase,
we work with `vectorized' versions of $\bS{\beta},\bS{X},\bS{y}, \bS{\pi}$,
denoted by $\vec{\beta},\vec{X},\vec{y}, \vec{\pi}$ respectively, where the
matrices are stacked up in row-major order.
Using this notation, we can rewrite the first derivative in \eqref{eq:first_derivative} as:
\begin{equation}\label{eq:first_derivative_vectorized}
\frac{\partial l(\bS{\beta})}{\partial \vec{\beta}} = \vec{X}^{T} (\vec{y} - \vec{\pi})
\end{equation}
Similarly, we can rewrite the second derivative in \eqref{eq:second_derivative} as:
\begin{equation}\label{eq:second_derivative_vectorized}
\frac{\partial^2 l(\bS{\beta})}{\partial^2 \vec{\beta}} = \vec{X}^{T} W \vec{X}
\end{equation}
where $W$ is a weight matrix whose entries are set to
$\pi_{i,j_1}\pi_{i,j_2}$ if $j_1 \neq j_2$ and to
$\pi_{i,j_1}(1-\pi_{i,j_2})$ otherwise. Note that
\eqref{eq:second_derivative_vectorized} is merely a compact way to write
\eqref{eq:second_derivative}.
The Newton method procedure is illustrated in Algorithm \ref{alg:Newton}.
\begin{algorithm}
% [Newton's Method] \label{alg:newton}
% \caption{Newton's Method}
\alginput{$\vec{X},\vec{Y}$ and an initial guess for parameters $\vec{\beta^{0}}$}
\algoutput{The maximum likelihood estimator $\beta_{MLE}$}
\begin{algorithmic}[1]
\State $I \leftarrow 0$
\Repeat
\State Diagonal Weight matrix $W$: $w_{j_1,j_2} \leftarrow \pi_{i,j_1}\pi_{i,j_2}$ if $j_1 \neq j_2$ or $\pi_{i,j_1}(1-\pi_{i,j_2})$ otherwise
\State Compute $\vec{\beta}^{I+1}$ using:
\State $\vec{\beta}^{I+1} =\vec{\beta}^{I} - (\vec{X}^{T} W \vec{X})^{-1} \vec{X}^{T} (\vec{y} - \vec{\pi}) $
\Until{$\vec{\beta}^{I+1}$ converges}
\end{algorithmic}
\label{alg:Newton}
\end{algorithm}
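To make the update step in Algorithm~\ref{alg:Newton} concrete, the sketch below
implements it in Python/NumPy under simplifying assumptions (dense in-memory
data, a fixed number of iterations, one-hot observations \texttt{Y}); all names
are illustrative and do not correspond to MADlib's implementation.
\begin{verbatim}
# Illustrative NumPy sketch of the Newton update for multinomial logistic
# regression; category J is the baseline with coefficients fixed at zero.
import numpy as np

def fit_multinomial_newton(X, Y, iters=20):
    """X: (N, K+1) design matrix incl. intercept; Y: (N, J) one-hot outcomes."""
    N, K1 = X.shape
    J = Y.shape[1]
    beta = np.zeros((K1, J - 1))
    for _ in range(iters):
        eta = X @ beta                               # linear predictors (N, J-1)
        expeta = np.exp(eta)
        pi = expeta / (1.0 + expeta.sum(axis=1, keepdims=True))
        grad = X.T @ (Y[:, :J - 1] - pi)             # first derivative (matrix form)
        # second derivative (tensor form), flattened into a square matrix
        H = np.zeros((K1 * (J - 1), K1 * (J - 1)))
        for j1 in range(J - 1):
            for j2 in range(J - 1):
                w = pi[:, j1] * ((j1 == j2) - pi[:, j2])
                H[j1*K1:(j1+1)*K1, j2*K1:(j2+1)*K1] = -(X * w[:, None]).T @ X
        step = np.linalg.solve(H, grad.T.reshape(-1))  # [l'']^{-1} l'
        beta = beta - step.reshape(J - 1, K1).T        # Newton update
    return beta
\end{verbatim}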
\subsection{Common Statistics}\label{sec:stats}
Irrespective of the solver that we choose to implement, we need to calculate the standard errors and p-values of the coefficients.
Asymptotic theory tells us that the MLE is an asymptotically efficient estimator, which means that it attains the following Cramer-Rao lower bound:
\begin{equation}
\sqrt{n}(\beta_{MLE} - \beta) \rightarrow \mathcal{N}(0,I^{-1})
\end{equation}
where $I$ is the Fisher information matrix. Hence, we evaluate the standard error of the $i^{th}$ parameter (in vector form) of the MLE from its asymptotic variance as:
\begin{equation}
	se(\vec{\beta}_i) = \sqrt{\Big[ (\vec{X}^{T} W \vec{X})^{-1} \Big]_{ii}}
\end{equation}
The Wald statistic is used to assess the significance of coefficients $\beta$.
In the Wald test, the MLE is compared with the null hypothesis while assuming
that the difference between them is approximately normally distributed. The Wald
p-value for a coefficient provides us with the probability of seeing a value as
extreme as the one observed when the null hypothesis is true. We evaluate the Wald
p-value as:
\begin{equation}
	p_i = \mbox{Pr}\Big(|Z| \geq \big|\frac{\vec{\beta}_i}{se(\vec{\beta}_i)} \big| \Big) = 2 \Big[1 - F\Big(\big|\frac{\vec{\beta}_i}{se(\vec{\beta}_i)}\big|\Big) \Big]
\end{equation}
where $Z$ is a normally distributed random variable and $F$ represents the cdf of $Z$.
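For concreteness, a minimal Python sketch of these statistics is shown below; it
assumes the matrix $(\vec{X}^{T} W \vec{X})^{-1}$ has already been computed, and
the function name is hypothetical.
\begin{verbatim}
# Illustrative sketch: standard errors and two-sided Wald p-values from the
# vectorized coefficients and the inverse of X^T W X.
import numpy as np
from scipy.stats import norm

def wald_statistics(beta_vec, inv_info):
    se = np.sqrt(np.diag(inv_info))          # standard errors
    z = beta_vec / se                         # Wald z-statistics
    p = 2.0 * (1.0 - norm.cdf(np.abs(z)))     # two-sided p-values
    return se, z, p
\end{verbatim}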
\section{Implementation}\label{sec:implem}
For the implementation, we plan to mimic the framework used for the current
implementation of the logistic regression. In this framework, the regression is
broken up into 3 steps, denoted as the \textit{transition} step, the
\textit{merge states} step, and the \textit{final} step. Much of the
computation is done in parallel, and each transition step operates on a small
number of rows in the data matrix to compute a portion of the calculation for
$\vec{X}^T W\vec{X}$, as well as the calculation for $\vec{X}^TW\vec{\alpha}$.
This is used in the Hessian calculation in equation
\ref{eq:second_derivative_vectorized}.
The merge step aggregates these transition steps together. This consists of
summing the $X^T WX$ calculations, summing the $\vec{X}^TW\vec{\alpha}$
calculations, and updating the bookkeeping. The final step computes the
current solution $\vec{\beta}$, and returns it and its associated statistics.
The three steps in the framework communicate by passing around \textit{state} objects. Each state object carries with it several fields:
\begin{description}
\item [coef] This is the column vector containing the current solution. This is the $\beta$ part of $\beta^TX$ with the exception of $\beta_0$.
\item [beta] The scalar constant term $\beta_0$ in $\beta^TX$.
\item [widthOfX] The number of features in the data matrix.
\item [numRows] The number of data points being operated on.
\item [dir] The direction of the gradient in the $k$th step.
\item [grad] The gradient vector in the $k$th step.
\item [gradNew] The gradient vector in the $(k+1)$th step.
\item [X\_transp\_AX] The current sum of $\vec{X}^T W\vec{X}$.
\item [X\_transp\_Az] The current sum of $\vec{X}^TW\vec{\alpha}$.
\item [logLikelihood] The log likelihood of the solution.
\end{description}
Each transition step is given an initial state, and returns a state with the updated X\_transp\_AX and X\_transp\_Az fields. These states are iteratively passed to a merge step, which combines them two at a time. The final product is then passed to the final step. We expect to use the same system, or something similar.
We can formalize the API's for these three steps as:
\begin{verbatim}
multilogregr_irls_step_transition(AnyType *Args)
multilogregr_irls_step_merge(AnyType *Args)
multilogregr_irls_step_final(AnyType *Args)
\end{verbatim}
The \texttt{AnyType} object is a generic type used in MADlib to retrieve and return values from the backend. Among other things, an \texttt{AnyType} object can be a \texttt{NULL} value, a scalar, or a state object.
The first step, \texttt{multilogregr\_cg\_step\_transition}, expects \texttt{*Args} to contain the following items in the following order: a state object for the current state, a vector $x$ containing a row of the design matrix, a vector $y$ containing the values of $\bS{Z}$ for this row, and a state object for the previous state. The return value for this function is another state object.
The second step, \texttt{multilogregr\_cg\_step\_merge}, expects \texttt{*Args} to contain two state objects, and returns a state object expressing the merge of the two input objects. The final step, \texttt{multilogregr\_cg\_step\_final}, expects a single state object and returns an \texttt{AnyType} object containing the solution's coefficients, the standard errors, and the p-values.
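To illustrate how the three steps fit together, the following Python sketch uses
plain least-squares sums in place of the multinomial IRLS quantities; it shows
only the aggregation pattern, and none of the names correspond to MADlib's
C++ API.
\begin{verbatim}
# Sketch of the transition/merge/final pattern with least-squares sums.
import numpy as np

def transition(state, x_row, y_row):
    """Fold one row of the design matrix into the running sums."""
    XtX, Xty, n = state
    return (XtX + np.outer(x_row, x_row), Xty + y_row * x_row, n + 1)

def merge(state_a, state_b):
    """Combine the partial sums computed on two chunks of the data."""
    return tuple(a + b for a, b in zip(state_a, state_b))

def final(state):
    """Solve for the coefficients from the aggregated state."""
    XtX, Xty, _ = state
    return np.linalg.solve(XtX, Xty)

# Usage: rows are split across two "segments", then merged and finalized.
X = np.array([[1., 0.], [1., 1.], [1., 2.], [1., 3.]])
y = np.array([1., 3., 5., 7.])
empty = (np.zeros((2, 2)), np.zeros(2), 0)
seg1 = transition(transition(empty, X[0], y[0]), X[1], y[1])
seg2 = transition(transition(empty, X[2], y[2]), X[3], y[3])
print(final(merge(seg1, seg2)))   # -> approximately [1., 2.]
\end{verbatim}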
\section{Regularization} % (fold)
\label{sub:regularization}
Usually, the response $y$ is the result of measurements contaminated by small
errors (noise). Ill-conditioned or singular systems also arise frequently in the
iterative solution of nonlinear systems or optimization problems. In all such
situations, the vector $x = {A}^{-1}y$ (or, in the full-rank overdetermined case,
$A^+ y$ with the pseudoinverse $A^+ = (A^T A)^{-1}A^T$), if it exists at all, is
usually a meaningless, bad approximation to $x$.
Regularization techniques are needed to obtain meaningful solution estimates
for such ill-posed problems, and in particular when the number of parameters
is larger than the number of available measurements, so that standard least
squares techniques break down.
\subsection{Linear Ridge Regression}
Ridge regression is the most commonly used method of regularization of
ill-posed problems. Mathematically, it seeks to minimize
\begin{equation}
Q\left(\vec{w},w_0;\lambda\right)\equiv \min_{\vec{w},w_0}\left[ \frac{1}{2N} \sum_{i=1}^{N} \left( y_i - w_0 -
\vec{w} \cdot \vec{x}_i \right)^2
+\frac{\lambda}{2}\|\vec{w}\|_2^2 \right]\ ,
\end{equation}
for a given value of $\lambda$, where $\vec{w}$ and $w_0$ are the fitting coefficients, and $\lambda$
is a non-negative regularization parameter. $\vec{w}$ is a vector in
$d$ dimensional space, and
\begin{equation}
\|\vec{w}\|_2^2 = \sum_{j=1}^{d}w_j^2 = \vec{w}^T\vec{w}\ .
\end{equation}
When $\lambda = 0$, $Q$ is
the mean squared error of the fitting.
The intercept term $w_0$ is not regularized, because this term is
fully decided by the mean values of $y_i$ and $\vec{x}_i$ and the
values of $\vec{w}$, and does not affect the model's complexity.
$Q\left(\vec{w},w_0;\lambda\right)$ is a quadratic function of $\vec{w}$ and
$w_0$, and thus can be solved analytically
\begin{equation}
\vec{w}_{ridge}=\left(\lambda\vec{I}_d +
\vec{X}^T\vec{X}\right)^{-1}\vec{X}^T\vec{y}\ .
\end{equation}
By using the available Newton method (Sec.~6.2.4), the above quantity can be
calculated in a single Newton step.
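As an illustration, the closed-form expression above can be computed directly;
the following minimal NumPy sketch mirrors the displayed formula (handling the
unregularized intercept by centering), and its names are illustrative rather
than MADlib's implementation.
\begin{verbatim}
# A minimal sketch of the displayed closed-form ridge solution; the intercept
# is recovered from the means and is not regularized. Names are illustrative.
import numpy as np

def ridge_closed_form(X, y, lam):
    """X: (N, d) feature matrix without intercept column; y: (N,) responses."""
    Xc = X - X.mean(axis=0)                  # centering handles the intercept
    yc = y - y.mean()
    d = X.shape[1]
    w = np.linalg.solve(lam * np.eye(d) + Xc.T @ Xc, Xc.T @ yc)
    w0 = y.mean() - X.mean(axis=0) @ w       # unregularized intercept term
    return w, w0
\end{verbatim}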
Many packages for ridge regularization actually regularize the fitting
coefficients not for the original data but for scaled data. MADlib also provides
this option: when the normalization parameter is set to True (it is False by
default), the data are first transformed as follows before the ridge
regularization is applied.
\begin{equation}
x_i' \leftarrow \frac{x_i - \langle x_i \rangle}{\langle (x_i -
\langle x_i \rangle)^2\rangle} \ ,
\end{equation}
\begin{equation}
y_i \leftarrow y_i - \langle y_i \rangle \ ,
\end{equation}
where $\langle \cdot \rangle = \sum_{i=1}^{N} \cdot / N$.
Note that Ridge regressions for scaled data and un-scaled data are not equivalent.
\subsection{Elastic Net Regularization} % (fold)
\label{ssub:elastic_net_regularization}
As a continuous shrinkage method, ridge regression achieves its better
prediction performance through a bias-variance trade-off. However, ridge
regression cannot produce a parsimonious model, for it always keeps all the
predictors in the model~\cite{zou2005}. Best subset selection in contrast
produces a sparse model, but it is extremely variable because of its inherent
discreteness.
A promising technique called the lasso was proposed by Tibshirani (1996). The
lasso is a penalized least squares method imposing an L1-penalty on the
regression coefficients. Owing to the nature of the L1-penalty, the lasso does
both continuous shrinkage and automatic variable selection simultaneously.
Although the lasso has shown success in many situations, it has some
limitations. Consider the following three scenarios:
\begin{enumerate}
\item In the case where the number of features ($p$) is much greater than the number of observations ($n$), i.e., $p \gg n$,
the lasso selects at most $n$ variables before it saturates, because of the
nature of the convex optimization problem. This seems to be a limiting
feature for a variable selection method. Moreover, the lasso is not well
defined unless the bound on the $L_1$-norm of the coefficients is smaller
than a certain value.
\item If there is a group of variables among which the pairwise correlations are
very high, then the lasso tends to select only one variable from the group
and does not care which one is selected.
\item For usual $n > p$ situations, if there are high correlations between predictors,
it has been empirically observed that the prediction performance of the
lasso is dominated by ridge regression.
\end{enumerate}
These scenarios make lasso an inappropriate variable selection method in some situations.
Hui Zou and Trevor Hastie~\cite{zou2005} introduced a new regularization
technique called the `elastic net'. Similar to the lasso, the elastic net
simultaneously does automatic variable selection and continuous
shrinkage, and it can select groups of correlated variables. It is like a
stretchable fishing net that retains `all the big fish'.
The elastic net regularization minimizes the following target function
\begin{equation} \label{eq:target}
  \min_{\vec{w} \in R^N}L(\vec{w}) + \lambda \left[\frac{1-\alpha}{2}\|\vec{w}\|_2^2 +
    \alpha \|\vec{w}\|_1\right]\ ,
\end{equation}
where $\|\vec{w}\|_1 = \sum_{i=1}^N|w_i|$ and $N$ is the number of features.
For the elastic net regularization on linear models,
\begin{equation}
L(\vec{w}) = \frac{1}{2M}\sum_{m=1}^M\left(y_m - w_0 - \vec{w} \cdot
\vec{x}_m\right)^2\ ,
\end{equation}
where the sum is over all observations and $M$ is the total number of
observations.
For the elastic net regularization on logistic models,
\begin{equation}
L(\vec{w}) = \sum_{m=1}^M\left[y_m \log\left(1 + e^{-(w_0 +
\vec{w}\cdot\vec{x}_m)}\right) + (1-y_m) \log\left(1 + e^{w_0 +
\vec{w}\cdot\vec{x}_m}\right)\right]\ ,
\end{equation}
where $y_m \in \{0,1\}$.
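For reference, the sketch below evaluates the elastic net objective of
Eq.~\eqref{eq:target} with the linear-model loss; the function and argument
names are illustrative only.
\begin{verbatim}
# Illustrative evaluation of the elastic-net objective for linear models.
import numpy as np

def elastic_net_objective(w, w0, X, y, lam, alpha):
    """X: (M, N) observations; w: (N,) coefficients; w0: intercept."""
    resid = y - w0 - X @ w
    loss = 0.5 * np.mean(resid ** 2)              # L(w) for linear models
    ridge = 0.5 * (1.0 - alpha) * np.dot(w, w)    # (1 - alpha)/2 * ||w||_2^2
    l1 = alpha * np.abs(w).sum()                  # alpha * ||w||_1
    return loss + lam * (ridge + l1)
\end{verbatim}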
\subsubsection{Optimizer Algorithms}
Two optimizer algorithms are currently supported: the default is
FISTA, and the other is IGD.
\paragraph{FISTA}
Fast Iterative Shrinkage Thresholding Algorithm (FISTA) with
\textbf{backtracking step size} [4]:
\vspace{0.2in}
\hrule
\vspace{0.2in}
\textbf{Step $0$}: Choose $\delta>0$ and $\eta > 1$, and
$\vec{w}^{(0)} \in R^N$. Set $\vec{v}^{(1)}=\vec{w}^{(0)}$ and
$t_1=1$.
\textbf{Step $k$}: ($k \ge 1$) Find the smallest nonnegative integer
$i_k$ such that with $\bar{\delta} = \delta_{k-1}/\eta^{i_k-1}$
\begin{equation}
  F(p_{\bar{\delta}}(\vec{v}^{(k)})) \le
  Q_{\bar{\delta}}(p_{\bar{\delta}}(\vec{v}^{(k)}), \vec{v}^{(k)})\ .
\end{equation}
Set $\delta_k = \delta_{k-1}/\eta^{i_k-1}$ and compute
\begin{equation}
\vec{w}^{(k)} = p_{\delta_k}\left(\vec{v}^{(k)}\right)\ ,
\end{equation}
\begin{equation}
t_{k+1} = \frac{1 + \sqrt{1 + 4t_k^2}}{2}\ ,
\end{equation}
\begin{equation}
\vec{v}^{(k+1)} = \vec{w}^{(k)} +
\frac{t_k-1}{t_{k+1}}\left(\vec{w}^{(k)} - \vec{w}^{(k-1)}\right)\ .
\end{equation}
\vspace{0.2in}
\hrule
\vspace{0.2in}
Here,
\begin{equation}
F(\vec{w}) = f(\vec{w}) + g(\vec{w})\ ,
\end{equation}
where $f(\vec{w})$ is the differentiable part of
Eq. (\ref{eq:target}) and $g(\vec{w})$ is the non-differentiable part.
For linear models,
\begin{equation}
f(\vec{w}) = \frac{1}{2M}\sum_{m=1}^M\left(y_m - w_0 - \vec{w} \cdot
\vec{x}_m\right)^2 + \frac{\lambda(1-\alpha)}{2}\|\vec{w}\|_2^2\ ,
\end{equation}
and for logistic models,
\begin{equation}
f(\vec{w}) = \sum_{m=1}^M\left[y_m \log\left(1 + e^{-(w_0 +
\vec{w}\cdot\vec{x}_m)}\right) + (1-y_m) \log\left(1 + e^{w_0 +
\vec{w}\cdot\vec{x}_m}\right)\right] + \frac{\lambda(1-\alpha)}{2}\|\vec{w}\|_2^2\ .
\end{equation}
And for both types of models,
\begin{equation}
g(\vec{w}) = \lambda\alpha\sum_{i=1}^N|w_i|\ .
\end{equation}
And
\begin{equation}
Q_{\delta}(\vec{a}, \vec{b}) := f(\vec{b}) + \langle \vec{a} -
\vec{b}, \nabla f(\vec{b})\rangle +
\frac{1}{2\delta}\|\vec{a} - \vec{b}\|^2 + g(\vec{a})\ ,
\end{equation}
where $\langle\cdot,\cdot\rangle$ denotes the usual vector dot product.
And the proxy function is defined as
\begin{equation}
p_\delta (\vec{v}) := \underset{\vec{w}}{\operatorname{arg\,min}} \left[g(\vec{w}) +
\frac{1}{2\delta}\left\|\vec{w} - \left(\vec{v} - \delta\,\nabla f(\vec{v})\right)\right\|^2 \right]
\end{equation}
For our case, where $g(\vec{w}) = \lambda\alpha\sum_{i=1}^N|w_i|$, the
proxy function is simply equal to the soft-thresholding function applied
componentwise to $\vec{u}$
\begin{equation}
  \left[ p_\delta (\vec{v}) \right]_i = \left\{ \begin{array}{ll}
      u_i - \lambda\alpha\delta\ , \quad & \mbox{if } u_i > \lambda\alpha\delta \\
      0\ , \quad & \mbox{if } \vert u_i \vert \le \lambda\alpha\delta \\
      u_i + \lambda\alpha\delta\ , \quad & \mbox{if } u_i < - \lambda\alpha\delta
    \end{array}
  \right.
\end{equation}
where
\begin{equation}
\vec{u} = \vec{v} - \delta\,\nabla f(\vec{v})\ .
\end{equation}
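The following NumPy sketch puts the pieces together for the elastic-net linear
model: backtracking on the step size, the soft-thresholding proxy step, and the
momentum update. It uses the standard non-increasing backtracking rule (the note
below describes MADlib's variant, which lets the step size grow), omits the
intercept, and all names are illustrative.
\begin{verbatim}
# Illustrative FISTA with backtracking for the elastic-net linear model.
import numpy as np

def soft_threshold(u, t):
    return np.sign(u) * np.maximum(np.abs(u) - t, 0.0)   # prox of t * ||.||_1

def fista(X, y, lam, alpha, delta=1.0, eta=2.0, iters=100):
    M, N = X.shape
    f = lambda w: 0.5 * np.mean((y - X @ w) ** 2) + 0.5 * lam * (1 - alpha) * w @ w
    grad = lambda w: -X.T @ (y - X @ w) / M + lam * (1 - alpha) * w
    g = lambda w: lam * alpha * np.abs(w).sum()
    w_prev = w = np.zeros(N)
    v, t = w.copy(), 1.0
    for _ in range(iters):
        gv = grad(v)
        while True:                              # shrink delta until F(p) <= Q(p, v)
            p = soft_threshold(v - delta * gv, lam * alpha * delta)
            Q = f(v) + (p - v) @ gv + (p - v) @ (p - v) / (2 * delta) + g(p)
            if f(p) + g(p) <= Q:
                break
            delta /= eta
        w_prev, w = w, p
        t_next = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t * t))
        v = w + (t - 1.0) / t_next * (w - w_prev)    # momentum step
        t = t_next
    return w
\end{verbatim}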
\textbf{Active set method} is used in our implementation of FISTA to speed up the
computation. Considerable speedup is obtained by organizing the iterations
around the active set of features: those with nonzero coefficients. After a
complete cycle through all the variables, we iterate over only the active set until
convergence. If another complete cycle does not change the active set, we are
done; otherwise the process is repeated.
\textbf{Warm-up method} is also used to speed up the computation. When the option
parameter \emph{warmup} is set to True, a strictly decreasing sequence of lambda
values, ending at the lambda value that the user wants to compute, is used. The
larger lambda values give very sparse solutions, and each sparse solution is used
as the initial guess for the next lambda's solution, which speeds up the
computation for that lambda. For larger data sets, this can sometimes accelerate
the whole computation so that it is faster than computing with only the single,
final lambda value.
\textbf{Note:} Our implementation is a little bit different from the original
FISTA. In the original FISTA, during the backtracking procedure, the algorithm
is searching for a non-negative integer $i_k$ and the new step size is $\delta_k
= \delta_{k-1}/\eta^{i_k}$. Thus the step size is non-increasing. Here, we allow
the step size to increase by using $\delta_k = \delta_{k-1}/\eta^{i_k-1}$ so
that larger step sizes can be tried by the algorithm. Tests show that this is
actually faster.
\paragraph{IGD}
The Incremental Gradient Descent (IGD) algorithm is stochastic by
nature, so it is difficult to obtain sparse solutions with it. What we implement is the
Stochastic Mirror Descent Algorithm made Sparse (SMIDAS). The inverse $p$-form
link function is used
\begin{equation} \label{eq:f}
h_j^{-1}(\vec{\theta}) = \frac{\mbox{sign}(\theta_j)\vert \theta_j
\vert^{p-1}}{\|\theta\|_p^{p-2}}\ ,
\end{equation}
where
\begin{equation}
  \|\theta\|_p = \left(\sum_j \vert \theta_j \vert ^p\right)^{1/p}\ ,
\end{equation}
and $\mbox{sign}(0) = 0$.
\vspace{0.2in}
\hrule
\vspace{0.2in}
Choose step size $\delta > 0$. \\
Let $p = 2\log d$, where $d$ is the number of features, and let $h^{-1}$ be as in Eq. (\ref{eq:f}). \\
Let $\vec{\theta} = \vec{0}$. \\
\textbf{for} $k = 1,2,\dots$ \\
\quad $\vec{w} = h^{-1}(\vec{\theta})$ \\
\quad $\vec{v} = \nabla f(\vec{w})$ \\
\quad $\tilde{\vec{\theta}} = \vec{\theta} - \delta \, \vec{v}$ \\
\quad $\forall j, \ \theta_j = \mbox{sign}(\tilde{\theta}_j)\max (0,
\vert \tilde{\theta}_j \vert- \lambda\alpha\delta)$
\vspace{0.2in}
\hrule
\vspace{0.2in}
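The listing above translates directly into the following NumPy sketch; for
clarity it uses the full gradient of the smooth part $f$ instead of a stochastic
one, and the lower bound on $p$ for very small $d$ is an added safeguard, not
part of the original algorithm.
\begin{verbatim}
# Illustrative SMIDAS update loop (full-gradient variant for clarity).
import numpy as np

def smidas(grad_f, d, lam, alpha, delta=0.1, iters=1000):
    """grad_f(w): gradient of the smooth part f at w; d: number of features."""
    p = max(2.0 * np.log(d), 2.0)           # p = 2 log d (guarded for tiny d)

    def h_inv(theta):                       # inverse p-form link function
        norm_p = np.sum(np.abs(theta) ** p) ** (1.0 / p)
        if norm_p == 0.0:
            return np.zeros_like(theta)
        return np.sign(theta) * np.abs(theta) ** (p - 1) / norm_p ** (p - 2)

    theta = np.zeros(d)
    for _ in range(iters):
        w = h_inv(theta)
        v = grad_f(w)
        theta_tilde = theta - delta * v
        theta = np.sign(theta_tilde) * np.maximum(
            np.abs(theta_tilde) - lam * alpha * delta, 0.0)
    return h_inv(theta)
\end{verbatim}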
The fitting coefficients produced by this algorithm are not truly
sparse, but the values are very small (usually $< 10^{-15}$), so they can
be safely set to zero after filtering with a threshold.
This is done as follows: (1) multiply each coefficient by the
standard deviation of the corresponding feature; (2) compute the
average of the absolute values of the re-scaled coefficients; (3) divide each
re-scaled coefficient by the average, and if the resulting absolute
value is smaller than \emph{threshold}, set the original coefficient
to zero.
IGD is by nature a sequential algorithm. When running in a
distributed fashion, each segment of the data runs its own SGD model, and
the models are averaged to produce a model for each iteration. This
averaging may slow down the convergence, although it provides the
ability to process large data sets on multiple machines. The
algorithm therefore provides an option \emph{parallel} to let the user choose
whether to do the computation in parallel.
IGD also implements the \textbf{warm-up method}.
\textbf{Stopping Criteria} Both \textbf{FISTA} and \textbf{IGD} compute the average difference
between the coefficients of two consecutive iterations; the computation stops when the
difference is smaller than \emph{tolerance} or the number of iterations
exceeds \emph{max\_iter}.
Looking ahead to the robust variance computation in the next section, we note that the Hessian and derivative both depend on $\beta$, so the variance estimates will also depend on $\beta$. Rather than allow the user to specify a $\beta$ value, the implementation computes the optimal $\beta$ by running the appropriate regression before computing the robust variance. In cases where the regression has parameters (regression tolerance, max iterations), the interface allows the user to specify those parameters.
% subsubsection elastic_net_regularization (end)
% subsection regularization (end)
\section{Robust Variance via Huber-White Sandwich Estimators}
Given $N$ data points, the $i$th point is defined by a feature vector $x_i \in \mathbb{R}^M$ and a response $y_i$, where $y_i \in \mathbb{R}$, $y_i \in \{0,1 \}$, or $y_i \in \{1,\dots, L \}$ for linear, logistic, and multi-logistic regression, respectively. We assume that $y_i$ is drawn from an independent and identically distributed (i.i.d.) distribution determined by a $K$-dimensional parameter vector $\beta$ (which is $K\times L$ dimensional for multi-logistic regression).
Generally, we are interested in finding the values of $\beta$ that best predict $y_i$ from $x_i$, with \textit{best} being defined as the values that maximize some log-likelihood function $l(y,x,\beta)$. The maximization is typically solved using the derivative of the likelihood $\psi$ and the Hessian $H$. More formally, $\psi$ is defined as
\begin{align}
\psi(y,x, \beta) = \frac{\partial l(x,y,\beta)}{\partial \beta}
\end{align}
and $H$ is defined as
\begin{align}
H(y,x, \beta) = \frac{\partial^2 l(x,y,\beta)}{\partial \beta^2}.
\end{align}
Using these derivatives, one can solve the logistic or linear regression for the optimal solution, and compute the variance and other statistics of the regression.
\subsection{Sandwich Operators}
However, we may believe that the underlying distribution is not i.i.d.\ (in particular, the variance is not independent of $x_i$), in which case our variance estimates will be incorrect.
The Huber sandwich estimator is used to get a robust estimate of the variance even if the i.i.d. assumption is wrong. The estimator is known as a sandwich estimator because the robust covariance matrix $S(\beta)$ of $\beta$ can be expressed in a \textit{sandwich formulation}, of the form
\begin{align}
S(\beta) = B(\beta) M(\beta) B(\beta).
\end{align}
The $B(\beta)$ matrix is commonly called the \textit{bread}, whereas the $M(\beta)$ matrix is the \textit{meat}.
\paragraph{The Bread}
Computing $B$ is relatively straightforward,
\begin{align}
B(\beta) = N\left(\sum_i^N -H(y_i, x_i, \beta) \right)^{-1}
\end{align}
\paragraph{The Meat}
There are several choices for the $M$ matrix, each with different robustness properties. In the Huber-White estimator, the matrix $M$ is defined as
\begin{align}
M_{H} =\frac{1}{N} \sum_i^N \psi(y_i,x_i, \beta)^T \psi(y_i,x_i, \beta).
\end{align}
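A compact NumPy sketch of the sandwich formula is given below; it assumes the
per-observation scores $\psi(y_i, x_i, \beta)$ and Hessians $H(y_i, x_i, \beta)$
have already been evaluated at the optimal $\beta$, and the names are
illustrative.
\begin{verbatim}
# Illustrative Huber-White sandwich: psi has shape (N, K), hessians (N, K, K).
import numpy as np

def huber_white_sandwich(psi, hessians):
    N = psi.shape[0]
    bread = N * np.linalg.inv(-hessians.sum(axis=0))   # B = N (sum_i -H_i)^{-1}
    meat = psi.T @ psi / N                              # M = (1/N) sum_i psi_i^T psi_i
    return bread @ meat @ bread                         # robust covariance S(beta)
\end{verbatim}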
\subsection{Implementation}
The Huber-White sandwich estimators implemented for linear, logistic, and multinomial-logistic regression mimic the same framework as the linear/logistic regression implementations. In these implementations, the gradient and Hessian are computed in parallel, and a final step operates on the aggregate result.
This framework breaks the computation into three steps: \textit{transition} step, the \textit{merge states} step, and the \textit{final} step. The transition step computes the gradient and Hessian contribution from each row in the data matrix. To compute this step, we need to define the derivatives for each regression.
\subsubsection{Linear Regression Derivatives}
For linear regression, the derivative is
\begin{align}
\frac{\partial l(x,y,\beta)}{\partial \beta} = X^T(y - X \beta)
\end{align}
and the Hessian is
\begin{align}
\frac{\partial^2 l(x,y,\beta)}{\partial^2 \beta} = -X^TX.
\end{align}
\subsubsection{Logistic Regression Derivatives}
For logistic regression, the derivative is
\begin{align}
\frac{\partial l(x,y,\beta)}{\partial \beta} = \sum_{i=1}^N \frac{-(-1)^{y_i}\cdot x_i}{1 + \exp\left((-1)^{y_i} \cdot \beta^Tx_i\right)} \exp\left((-1)^{y_i}\cdot \beta^Tx_i\right)
\end{align}
and the Hessian is
\begin{align}
\frac{\partial^2 l(x,y,\beta)}{\partial^2 \beta} = -X^TAX = -\sum_{i=1}^N A_{ii} x_i^T x_i
\end{align}
where $A$ is a diagonal matrix with
\begin{align}
A_{ii} = \left[ \left(1 + \exp(\beta^Tx_i) \right) \left(1 + \exp(-\beta^Tx_i)\right) \right]^{-1}.
\end{align}
\subsubsection{Multi-Logistic Regression Derivatives}
For multi-logistic regression, we replace $y_i$ with a vector $Y_i \in \{0,1\}^L$, where all entries of $Y_i$ are zero except the $y_i$th entry, which is set to 1. In addition, we choose a \textit{baseline} category against which the odds of all the other categories are measured. Let $J \in \{1, \dots, L \}$ be the baseline category.
We define the variables
\begin{align}
\pi_{i,j} &= \frac{\exp\left(\sum_{k=1}^K x_{i,k} \beta_{k,j} \right)}{1 + \sum_{j\ne J} \exp \left(\sum_{k=1}^K x_{i,k} \beta_{k,j} \right)}, \ \ \forall j \ne J\\
\pi_{i,J} &= \frac{1}{1 + \sum_{j\ne J} \exp \left(\sum_{k=1}^K x_{i,k} \beta_{k,j} \right)}.
\end{align}
The derivatives are then
\begin{equation}\label{eq:first_derivative2}
\frac{\partial l}{\partial \beta_{k,j}} = \sum_{i=1}^{N} \big( Y_{i,j} - \pi_{i,j} \big) x_{i,k} \ \ \ \ \forall k \ \forall j
\end{equation}
The Hessian is then
\begin{align}\label{eq:second_derivative2}
\frac{\partial^2 l({\beta})}{\partial \beta_{k_2,j_2} \partial \beta_{k_1,j_1}}
&= \sum_{i=1}^{N} -\pi_{i,j_2}x_{i,k_2}(1-\pi_{i,j_1})x_{i,k_1} &&j_1 = j_2 \\
&= \sum_{i=1}^{N} \pi_{i,j_2}x_{i,k_2}\pi_{i,j_1}x_{i,k_1} &&j_1 \neq j_2
\end{align}
\subsubsection{Merge Step and Final Step}
For the logistic and multi-logistic regressions, the derivative and Hessian are sums
whose terms can be computed in parallel, and thus in a distributed manner.
Each transition step computes a single term in the sum. The merge step sums two
or more terms computed by the transition steps, and the final step computes the
Hessian inverse, and the matrix product between the bread and meat matrices.
\section{Marginal Effects} % (fold)
\label{sub:marginal_effects}
\textit{Most of the notes below are based on~\cite{diekmann2008}}
A \emph{marginal effect} (ME) or partial effect measures the effect on the
conditional mean of $y$ of a change in one of the regressors, say
$x_k$~\cite{cameron2009}. In the linear regression model (without any interaction
terms), the ME equals the relevant slope coefficient, greatly simplifying analysis.
For nonlinear models, this is no longer the case, leading to remarkably many
different methods for calculating MEs.
Let $E(y_i | x_i)$ represent the expected value of a dependent variable $y_i$
given a vector of independent variables $x_i$. In the case where $y$ is a
linear function of $(x_1, \dots, x_M) = \vec{x}$ and $y$ is a continuous variable, a
linear regression model (without any interaction effects) can be stated as follows:
\begin{align*}
& y = \vec{x}^T \vec{\beta} \\
& \text{or} \\
& y = \beta_1 x_1 + \ldots + \beta_M x_M.
\end{align*}
From the above equation it is straightforward to see that the marginal effect of
any variable $x_k$ on the dependent variable is $\partial y / \partial x_k =
\beta_k$. However, this is true only if there exists no interaction between the
variables. If the output expression contains interaction terms, the model would be
\begin{gather*}
y = \beta_1 f_1 + \ldots + \beta_N f_N.
\end{gather*}
where $f_i$ is a function of the base variables $x_1, x_2, \ldots, x_M$ and
describes the interaction between the base variables. In the simple
(non-interaction) case, $f_i = x_i$ and $M = N$.
The standard approach to modeling dichotomous/binary variables
(so $y \in \{0, 1\}$) is to estimate a generalized linear model under the
assumption that $y$ follows some form of Bernoulli distribution. Thus the expected
value of $y$ becomes,
\begin{equation*}
    y = G(\vec{x}^T \vec{\beta}),
\end{equation*}
where $G$ is the specified binomial distribution function. Here we assume
logistic regression is used, and we use $g$ to refer to the inverse logit function. The same
approach is applied when $y$ is a discrete, multi-valued variable.
\subsection{Discrete change effect} % (fold)
\label{sub:discrete_change_effect}
Along with marginal effects we can also compute the following discrete change
effects.
\begin{enumerate}
\item Unit change effect, which should not be confused with the
marginal effect:
\begin{align*}
\frac{\partial y}{\partial x_k} & = P(y=1|X,x_k+1) - P(y=1|X, x_k) \\
& = g( \beta_1 x_1 + \dots + \beta_k (x_k+1) + \beta_l x_l) \\
& \qquad - g( \beta_1 x_1 + \dots + \beta_k x_k + \beta_l x_l)
\end{align*}
\item Centered unit change:
\begin{gather*}
\frac{\partial y}{\partial x_k} = P(y=1|X,x_k+0.5) - P(y=1|X, x_k-0.5) \\
\end{gather*}
\item Standard deviation change:
\begin{gather*}
\frac{\partial y}{\partial x_k} = P(y=1|X,x_k+0.5\delta_k) - P(y=1|X, x_k-0.5\delta_k) \\
\end{gather*}
\item Min-max change:
\begin{gather*}
\frac{\partial y}{\partial x_k} = P(y=1|X,x_k=x_k^{max}) - P(y=1|X, x_k=x_k^{min}) \\
\end{gather*}
\end{enumerate}
% subsection discrete_change_effect (end)
\subsection{Categorical variables} % (fold)
\label{sub:categorical_variables}
Categorical variables are converted to dummy variables to use in the regression.
The regression method treats the dummy variable as a continuous variable and
returns a coefficient for each dummy variable.
To compute marginal effect for each such dummy variable, similar to the
regression, we can treat it as a continuous variable, and
compute the partial derivative of the response variable with respect to this variable.
However, since these variables are discrete, using the partial derivative is inappropriate.
An alternative method is to compute the discrete change for each dummy variable with
respect to the reference level of the categorical variable. Let $x_k$ be a dummy
variable corresponding to the value $v$ of a categorical variable, and let $x_l$
be another dummy variable representing value $w$ of the same categorical
variable. The discrete difference with respect to $x_k$ is defined as:
\begin{align*}
\Delta_{x_k} y = y_{k}^{set} - y_{k}^{unset}
\end{align*}
where,
\begin{align*}
y_{k}^{set} &= \beta_1 f_1 + \ldots + \beta_k f_k(x_k=1) + \ldots + \beta_l f_l(x_l=0) + \ldots, \\
& \text{and} \\
y_{k}^{unset} &= \beta_1 f_1 + \ldots + \beta_k f_k(x_k=0) + \ldots + \beta_l f_l(x_l=0) + \ldots.
% \begin{cases}
% 1, & \text{if}\ a=1 \\
% 0, & \text{otherwise}
% \end{cases}
\end{align*}
% \left.y\right\vert_{x_k^{(v)}=1, x_k^{(w)}=0\ (w\neq v)} \\
% & \text{or} \\
% - \left.y\right\vert_{x_k^{(0)}=1, x_k^{(w)}=0 \ (w \neq 0)}\
% &= P_{set} - P_{unset},
If the $y$ expression contains a dummy variable ($x_p$) corresponding to the
reference level of the categorical variable (represented as $r$), then that
variable will be unset ($x_p = 0$) in $y^{set}$ and set ($x_p = 1$) in $y^{unset}$.
Note that in many cases the dummy variable for the reference level does not
appear in the regression model, and unsetting the variables for the values $w \neq r$
is enough in the second term ($y^{unset}$).
In MADlib, we only support the discrete difference method for dummy
variables. Let's walk through the computation of the marginal effect for \verb|color_blue|
(a short sketch in code follows the list) when the input for the regression is
\begin{verbatim}
array[1, color_blue, color_green, degree_college, degree_college * color_blue,
degree_college * color_green, gpa, gpa^2, degree_college * gpa,
degree_college * gpa^2, weight]
\end{verbatim}
for a specific row of the data. Here \verb|color_blue| and \verb|color_green|
belong to the same categorical variable, with \verb|color_red| being the
reference variable for it (not included in the regressor list).
\begin{enumerate}
\item The value of \verb|color_blue| would be set equal to 1
\item The value of \verb|color_blue| in its interaction terms would also be set to 1 \\
(i.e. \verb|degree_college * color_blue = degree_college|)
\item The value of \verb|color_green| would be set equal to 0
\item The value of \verb|color_green| in its interaction terms would also be set to 0 \\
(i.e. \verb|degree_college * color_green = 0|)
\item Compute resulting predicted values of response variable ($y^{set}$)
\item Set value of \verb|color_blue| equal to 0
\item Set value of \verb|color_blue| in its interaction terms to 0 \\
(i.e. \verb|degree_college * color_blue = 0|)
\item Set value of \verb|color_green| equal to 0 and also in its interaction terms \\
(i.e. \verb|degree_college * color_green = 0|)
\item Compute resulting predicted values of response variable ($y^{unset}$)
\item Compute ($y^{set} - y^{unset}$)
\end{enumerate}
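The walkthrough above can be written out as a short sketch; the feature-building
helper and its arguments are hypothetical and simply mirror the regressor list
of this example, with the last line showing the linear-predictor difference (for
a logistic model one would apply the logistic function before differencing).
\begin{verbatim}
# Hypothetical sketch of the color_blue discrete difference for one data row.
import numpy as np

def features(color_blue, color_green, degree_college, gpa, weight):
    """Regressor list used in the example above, as a vector."""
    return np.array([1.0, color_blue, color_green, degree_college,
                     degree_college * color_blue, degree_college * color_green,
                     gpa, gpa ** 2, degree_college * gpa,
                     degree_college * gpa ** 2, weight])

def discrete_diff_color_blue(beta, degree_college, gpa, weight):
    # steps 1-4: color_blue = 1, color_green = 0 (also in interaction terms)
    f_set = features(1, 0, degree_college, gpa, weight)
    # steps 6-8: color_blue = 0, color_green = 0 (also in interaction terms)
    f_unset = features(0, 0, degree_college, gpa, weight)
    # steps 5, 9, 10: predict and take the difference (linear predictor shown)
    return f_set @ beta - f_unset @ beta
\end{verbatim}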
% subsection categorical_variables (end)
\subsection{AME vs MEM} % (fold)
\label{sub:ame_vs_mem}
There are two main methods of calculating the marginal effects for dependent variables.
\begin{enumerate}
\item The first uses the average of the marginal effects at every sample
observation (AME).
% This is calculated as follows:
% \begin{gather*}
% \langle \frac{\partial P(y_i = 1)}{\partial x_k} \rangle = \frac{1}{n}\sum_{i=1}^{n} P(y_i = 1)(1-P(y_i = 1))\cdot\frac{\partial z_i}{\partial x_k}, \\
% \text{where, } P(y_i=1) = \frac{1}{1 + e^{-z_i}}, \\
% \text{and, } z_i = F(X_i)^T\beta \\
% \end{gather*}
\item The second approach calculates the marginal effect for $x_k$ using the
    predicted probability obtained when all regressors, with the exception of
    the variable under consideration, are held at their mean values (MEM).
% The derivation of this marginal effect is captured by the following:
% \begin{gather*}
% \left.\frac{\partial P(y=1)}{\partial
% x_k}\right\vert_{X=\bar{X}} = \quad
% P(y=1|\bar{X})(1-P(y=1|\bar{X}))
% \left.\frac{\partial z}{\partial x_k}\right\vert_{X=\bar{X}} \\
% \text{where, } \bar{X} = \frac{\sum_{i=1}^{n}X_i}{n}
% \end{gather*}
\end{enumerate}
It is generally viewed to be problematic to evaluate
marginal effects at means (MEM) of dummy variables since means of dummies refer
to nonexisting observations. In MADlib, we currently only provide the option to
compute the average marginal effects (AME). In future versions, we aim to
provide various options including marginal effects at means and marginal
effects at a representative value. We currently do
provide the option to compute the marginal effect on a separate dataset, which
could be used as a workaround by including just the mean value or the representative value.
% subsection ame_vs_mem (end)
\subsection{Marginal effects for regression methods} % (fold)
\label{sub:marginal_effects_for_regression_methods}
Below we derive the marginal effects for the various regression methods in MADlib.
We assume a general formulation of the output $y$ as
$$
y = g(\beta_1 f_1 + \beta_2 f_2 + \ldots + \beta_{N} f_{N}).
$$
As stated above, each $f_i$ is a function of all the base variables $x_1, x_2, \ldots, x_M$ and
describes the interaction between the base variables. In the simple
(non-interaction) case, $f_i = x_i$.
Let's represent the vector of coefficients as
$$
\vec{\beta} =
\begin{bmatrix}
\beta_1 \\
\vdots \\
\beta_N \\
\end{bmatrix},
$$
and the partial derivative of the $x$ terms in the output
with respect to each variable $x_i$ (Jacobian matrix) as
$$J =
\begin{bmatrix}
\pder[f_1]{x_1}, & \pder[f_1]{x_2}, & \pder[f_1]{x_3}, & \ldots, & \pder[f_1]{x_M}\\[7pt]
\pder[f_2]{x_1}, & \pder[f_2]{x_2}, & \pder[f_2]{x_3}, & \ldots, & \pder[f_2]{x_M}\\[7pt]
\pder[f_3]{x_1}, & \pder[f_3]{x_2}, & \pder[f_3]{x_3}, & \ldots, & \pder[f_3]{x_M}\\[7pt]
& & \vdots & & \\[7pt]
\pder[f_N]{x_1}, & \pder[f_N]{x_2}, & \pder[f_N]{x_3}, & \ldots, & \pder[f_N]{x_M}\\[7pt]
\end{bmatrix}
$$
In the above equation, for each element $J_{n,m}$ the column
corresponds to a base variable ($x_m$) and the row corresponds to a term in
the output expression ($f_n$).
Let $\nabla f_n$ denote the $n$-th row of $J$ (i.e. gradient of $f_n$),
and let $J(m) = \pder[\vec{f}]{x_m}$ denote the $m$-th column of $J$, where $\vec{f} = (f_1, \ldots f_N)^T$.
If $x_k$ is a categorical variable and if discrete differences are required
for it, then the column corresponding to $x_k$ can be replaced by,
\begin{align*}
\pder[\vec{f}]{x_k} &= \vec{f^{set_k}} - \vec{f^{unset_k}} \\
&= \begin{bmatrix}
f_0^{set_k} , & f_1^{set_k}, & f_2^{set_k} , & \ldots, & f_{N - 1}^{set_k}
\end{bmatrix}^T -
\begin{bmatrix}
f_0^{unset_k} , & f_1^{unset_k}, & f_2^{unset_k} , & \ldots, & f_{N - 1}^{unset_k}
\end{bmatrix}^T \\
&= \begin{bmatrix}
f_0^{set_k} - f_0^{unset_k}, & f_1^{set_k} - f_1^{unset_k}, &
f_2^{set_k} - f_2^{unset_k}, & \ldots, & f_{N - 1}^{set_k} - f_{N - 1}^{unset_k}
\end{bmatrix}^T,
\end{align*}
where
\begin{align*}
& f_i^{set_k} = f_i(\ldots, x_k=1, x_l=0, x_r=0) \\
& \text{and} \\
& f_i^{unset_k} = f_i(\ldots, x_k=0, x_l=0, x_r=1),
\end{align*}
$\forall x_l \in$ (set of dummy variables related to $x_k$ excluding the
reference variable) and $x_r = $ reference variable of $x_k$ (if present). The
response probability corresponding to $f^{set_k}$ is denoted as $P^{set_k}$.
\subsubsection{Linear regression} % (fold)
\label{ssub:linear_regression}
For the linear regression equation of the form $y = \vec{f}^T \vec{\beta}$,
the marginal effect for each variable ($x_i$) is the same as the coefficient for
that variable ($\beta_i$). When interaction effects are present, the marginal effect
will be,
$$
\mathit{ME} = J^T \vec{\beta}
$$
% ssubsection linear_regression (end)
\subsubsection{Logistic regression} % (fold)
\label{ssub:logistic_regression}
In logistic regression:
\begin{align*}
P &= \frac{1}{1 + e^{-\vec{f}^T \vec{\beta}}} \\
&= \frac{1}{1 + e^{-z}}
\end{align*}
\begin{align*}
\implies \frac{\partial P}{\partial X_k} &= P \cdot (1-P) \cdot
\frac{\partial z}{\partial x_k},
\end{align*}
where the partial derivative in the last equation equals $\beta_k$
if there are no interaction terms. Thus the marginal effect for all variables,
presented as a vector, will be
$$
\mathit{ME} = P \cdot (1-P) \cdot J^T \vec{\beta}
$$
For categorical variables, we compute the discrete difference as described in
\ref{sub:categorical_variables}.
For variable $x_k$:
$$
\mathit{ME_k} = P^{set} - P^{unset}
$$
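As a minimal illustration at a single data point, assuming the term values
$\vec{f}$ and the Jacobian $J$ have already been evaluated there (all names are
illustrative):
\begin{verbatim}
# Illustrative marginal effects for logistic regression at one data point:
# ME = P (1 - P) J^T beta, with f (N,), J (N, M), beta (N,).
import numpy as np

def logistic_marginal_effects(f, J, beta):
    P = 1.0 / (1.0 + np.exp(-(f @ beta)))    # predicted probability
    return P * (1.0 - P) * (J.T @ beta)      # one effect per base variable
\end{verbatim}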
% subsubsection logistic_regression (end)
\subsubsection{Multilogistic regression} % (fold)
\label{ssub:multilogistic_regression}
The probabilities of different outcomes for multilogistic regression are expressed as,
\begin{gather*}
P^l = P(y=l | \vec{x})
= \frac{e^{\vec{f}^T \vec{\beta^l}}}{1 + \displaystyle \sum_{q=1}^{L-1} e^{\vec{f}^T \vec{\beta^q}}}
= \frac{e^{\vec{f}^T \vec{\beta^l}}}{\displaystyle \sum_{q=1}^{L} e^{\vec{f}^T \vec{\beta^q}}},
\end{gather*}
where $\vec{\beta^l}$ represents the coefficient vector for category $l$, with
$L$ being the total number of categories. The coefficients are set to zero for
one of the outcomes, called the ``base outcome'' or the ``reference category''.
Here, without loss of generality, we let $\vec{\beta^L} = \vec{0}$.
% The odds of outcome $j$ verus outcome $m$ are
% \begin{align*}
% \frac{P(y=j | X)}{P(y=m | X)} & = \frac{\frac{e^{X\beta_j}}{\sum_{l=1}^{j} e^{X\beta_l}}}
% {\frac{e^{X\beta_m}}{\sum_{l=1}^{j} e^{X\beta_l}}} \\
% & = \frac{e^{X\beta_j}}{e^{X\beta_m}}
% \end{align*}
% Thus the partial derivative would be,
% \begin{gather*}
% \frac{\partial ln(P_j/P_m)}{\partial x_k} = \beta_{kj} - \beta_{km}
% \end{gather*}
% Thus, if $x_k$ is increased by one unit the odds of outcome $j$ against the
% base outcome changes by exp($\beta_{kj}$) (since $\beta_{km}=0$ for the base
% outcome). Expanding $P_m$ we finally get,
Thus,
\begin{align*}
\displaystyle
\pder[P^l]{x_m} &=
\frac{1}{\displaystyle \sum_{q=1}^{L} e^{\vec{f}^T \vec{\beta^q}}}
\pder[e^{\vec{f}^T \vec{\beta^l}}]{x_m} -
\frac{e^{\vec{f}^T \vec{\beta^l}} }
{\displaystyle \left(\sum_{p=1}^{L} e^{\vec{f}^T \vec{\beta^p}}\right)^2} \cdot
\pder{x_m} \displaystyle \sum_{q=1}^{L} e^{\vec{f}^T \vec{\beta^q}} \\[5pt]
&= \frac{e^{\vec{f}^T \vec{\beta^l}}}
{\displaystyle \sum_{q=1}^{L} e^{\vec{f}^T \vec{\beta^q}}} \pder[\vec{f}^T \vec{\beta^l}]{x_m} -
\frac{ e^{\vec{f}^T \vec{\beta^l}} }
{\displaystyle \sum_{p=1}^{L} e^{\vec{f}^T \vec{\beta^p}} }
\quad \displaystyle
\mathlarger{\sum_{q=1}^{L}}
\frac{ e^{\vec{f}^T \vec{\beta^q}} }
{\displaystyle \sum_{p=1}^{L} e^{\vec{f}^T \vec{\beta^p}}}
\pder[\vec{f}^T \vec{\beta^q}]{x_m} \\[5pt]
\end{align*}
Hence, for every $m$-th variable $x_m$, we have the marginal effect for category
$l$ as,
\begin{gather*}
\mathit{ME}_m^l = P^l\left(\displaystyle J(m)^T \vec{\beta^l} -
\sum_{q=1}^{L-1} P^q \cdot J(m)^T \vec{\beta^q} \right).
\end{gather*}
Vectorizing the above equation, we get
$$
\mathit{ME}^l = P^l \left(\displaystyle J^T \vec{\beta^l} - J^T B \vec{p} \right),
$$
where $\vec{p} = (P^1, \ldots, P^{L-1})^T$ is a column vector and $B = (\vec{\beta}^1, \dots, \vec{\beta}^{L-1})$ is an $N \times (L-1)$ matrix.
Finally, we can simplify the computation of the marginal effects matrix as
$$
\mathit{ME} = J^T B \mathit{diag}(\vec{p}) - J^T B \vec{p} \vec{p}^T.
$$
Once again, for categorical variables, we compute the discrete difference
as described in \ref{sub:categorical_variables}.
For categorical variable $x_k$, the $k$-th row of $\mathit{ME}$ will be,
$$
\mathit{ME_k} = \left( \vec{p^{set_k}} - \vec{p^{unset_k}} \right)^T
$$
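A sketch of the vectorized formula at a single data point is shown below; it
assumes the baseline category $L$ has zero coefficients and that $\vec{f}$ and
$J$ are already evaluated at the point of interest (names are illustrative).
\begin{verbatim}
# Illustrative marginal-effect matrix ME = J^T B diag(p) - J^T B p p^T.
import numpy as np

def multinomial_marginal_effects(f, J, B):
    """f: (N,) term values; J: (N, M) Jacobian; B: (N, L-1) coefficients."""
    scores = f @ B                              # f^T beta^l for l = 1..L-1
    exps = np.exp(np.append(scores, 0.0))       # baseline contributes e^0 = 1
    p = (exps / exps.sum())[:-1]                # P^1, ..., P^{L-1}
    JB = J.T @ B                                # (M, L-1)
    return JB * p - np.outer(JB @ p, p)         # (M, L-1) matrix of ME_m^l
\end{verbatim}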
% subsubsection multilogistic_regression (end)
% subsection marginal_effects_for_regression_methods (end)
\subsection{Standard Errors} % (fold)
\label{sub:standard_errors}
The delta method is a popular way to estimate standard errors of non-linear
functions of model parameters. While it is straightforward to calculate the
variance of a linear function of a random variable, it is not for a nonlinear
function. The delta method therefore relies on finding a linear approximation
of the function by using a first-order Taylor expansion.
We can approximate a function $g(x)$ about a value $a$ as,
\[ g(x) \approx g(a) + (x-a)g'(a) \]
Taking the variance and setting $a = \mu_x$,
\[ Var(g(X)) \approx \left[g'(\mu_x)\right]^2 Var(X) \]
\subsubsection*{Linear Regression} % (fold)
\label{ssub:std_linear_regression}
Using this technique, to compute the variance of the marginal effects
at a given observation value in \emph{linear regression}, we obtain
the standard error by first computing the marginal effect's derivative
over the coefficients, which is a $M\times N$ matrix $S_{mn}
= \frac{\partial \mathit{ME}_m}{\partial \beta_n}$
\begin{align*}
\vec{S} &= \pder[J^T \vec{\beta}]{\vec{\beta}} \\
&= J^T
\end{align*}
Using the delta method we can then compute the variance of the marginal effects
as,
\begin{align*}
Var(\mathit{ME}) = S \cdot Var(\beta)\cdot S^T\,
\end{align*}
where $Var(\beta)$ is an $N\times N$ matrix, $S$ is an $M\times N$
matrix, $M$ is the number of base variables, and $N$ is the total number of
terms in the independent variable list (i.e.\ the length of $\vec{\beta}$).
Note: $Var(\vec{\beta})$ is computed using the training data
employed by the underlying regression, not the data used to compute the
marginal effects. The delta matrix ($\vec{S}$) is computed over the
data for which the marginal effects are desired (averaged over the data for AME).
% subsubsection linear_regression (end)
\subsubsection*{Logistic Regression}
Similar to linear regression, we obtain the variance matrix by first computing
the delta matrix, $\vec{S}$:
\begin{eqnarray*}
S_{mn} &=& \frac{\partial}{\partial\beta_n} \left[P (1- P)
\cdot \frac{\partial z}{\partial x_m}\right]\\
&=& P (1- P) \cdot \frac{\partial}{\partial\beta_n}\left(\frac{\partial z}{\partial x_m}\right)
+ \frac{\partial \left[P (1- P)\right]}{\partial\beta_n} \cdot \frac{\partial z}{\partial x_m}\\
&=& P(1-P)\cdot\frac{\partial^2 z}{\partial x_m\partial\beta_n}
+ P(1-P)(1-2P) \cdot \frac{\partial z}{\partial \beta_n} \cdot \frac{\partial z}{\partial x_m},\\
\text{where } P &=& \frac{1}{1 + e^{-z}} \\
\text{and } z &=& \vec{f}^T \vec{\beta}. \\
\end{eqnarray*}
Using the definition of $z$, we can simplify $S$ a little bit
\begin{equation*}
S_{mn} = P(1-P)\left(\frac{\partial f_n}{\partial x_m} + (1-2P)\cdot f_n \cdot J(m)^T \vec{\beta} \right)
\end{equation*}
Thus, $n$-th column of $S$
$$
c_n(S) = P(1-P)\left[\nabla f_n^T + (1-2P) \cdot f_n \cdot J^T \vec{\beta}\right].
$$
Vectorizing this equation to express for the complete matrix,
\begin{equation*}
\vec{S} = P(1-P) \left(J^T +
(1-2P) \cdot (J^T \vec{\beta}) \vec{f}^T \right)
\end{equation*}
And for categorical variables, we replace $P(1-P)\cdot(\partial z/\partial x_m)$
with $\Delta_{x_m}P$ in $S_{mn}$, we get
\begin{eqnarray*}
S_{mn} &=& \frac{\partial(P^{set}-P^{unset})}{\partial\beta_n} \\
&=& P^{set} (1 - P^{set}) \cdot f_n^{set} - P^{unset} (1 - P^{unset}) \cdot f_n^{unset}
\end{eqnarray*}
Similar to linear regression, we can then compute the variance of the marginal effects as,
\begin{align*}
Var(\mathit{ME}) = S \cdot Var(\vec{\beta})\cdot S^T\,
\end{align*}
where $Var(\vec{\beta})$ is computed on the original training dataset, while
$\vec{S}$ is averaged over the dataset on which the marginal effect is desired
(which could be just a single data point).
\subsubsection*{Multinomial Logistic Regression}
For multinomial logistic regression, the coefficients $\vec{\beta}$ form a matrix of
dimension $N \times (L - 1)$ where $L$ is the number of categories and $N$ is the
number of features (including interaction terms). In order to compute the
standard errors on the marginal effects of category $l$ for independent variable
$x_m$, we need to compute the term $\pder[\mathit{ME}_{m}^{l}]{\beta_{n}^{l'}}$
for each $l' \in \{1 \ldots (L-1) \}$ and $n \in \{1 \ldots N \}$. Note that
$m$ here is restricted to be in the set $\{1 \ldots M\}$.
The result is a column vector of length $(L-1) \times N$ denoted by
$\pder[\mathit{ME}_{m}^{l}]{\vec{\beta}}$. Hence, for each category
$l \in \{1 \ldots (L-1)\}$ and independent variable $m \in \{1 \ldots M\}$, we
perform the following computation
\begin{equation}
Var(\mathit{ME}_m^l) = \pder[\mathit{ME}_m^l]{\vec{\beta}}^T V \pder[\mathit{ME}_m^l]{\vec{\beta}},
\end{equation}
where $V$ is the variance-covariance matrix of the multinomial logistic
regression vectorized in the \textbf{same order} in which the partial derivative
is vectorized.
From our earlier derivation, we know that the marginal effect for multinomial
logistic regression for the $m$-th index of data vector $\vec{x}$ is given as:
\begin{gather*}
\mathit{ME}_m^l = P^l \left[ J(m)^T \vec{\beta^l} -
\sum_{q=1}^{L-1} P^q J(m)^T \vec{\beta^q} \right]
\end{gather*}
where
\begin{gather*}
P^l = P(y=l| \vec{x}) = \frac{e^{\vec{f}\vec{\beta^l}}}{\displaystyle \sum_{q=1}^{L} e^{\vec{f}\vec{\beta^q}}}
\qquad \forall l \in \{ 1 \ldots (L-1) \}.
\end{gather*}
We now compute the term $\pder[\mathit{ME}_m^l]{\vec{\beta}}$. First,
we define the indicator function, $\delta$, as:
\begin{equation*}
\delta_{i,j} = \begin{cases} 1 & \mbox{if } i=j \\
0 & \mbox{otherwise} \end{cases}.
\end{equation*}
We can show that for each $l' \in \{ 1 \ldots (L-1) \}$ and $n \in \{1 \ldots N\}$,
the partial derivative will be
\begin{align*}
\pder[\mathit{ME}_m^l]{\beta_n^{l'}} &=
\pder[P^l]{\beta_n^{l'}}
\left[ J(m)^T \vec{\beta^l} -
\sum_{q=1}^{L-1} P^q J(m)^T \vec{\beta^q}
\right] +
P^l \left[ \pder[]{\beta_n^{l'}}(J(m)^T \vec{\beta^l}) -
\pder[]{\beta_n^{l'}} \left( \sum_{q=1}^{L-1} P^q J(m)^T \beta^q \right)
\right]
\end{align*}
where
\begin{align*}
\pder[P^l]{\beta_n^{l'}} &= P^l f_n (\delta_{l,l'} - P^{l'})
\end{align*}
The expression above can be simplified to obtain
\begin{align*}
\pder[\mathit{ME}_m^l]{\beta_n^{l'}} &=
P^l f_n (\delta_{l,l'} - P^{l'})
\left[ J(m)^T \vec{\beta^l} -
\sum_{q=1}^{L-1} P^q J(m)^T \vec{\beta^q}
\right] + \\
& \qquad P^l \left[ \delta_{l,l'} \pder[f_n]{x_m} - P^{l'} f_n J(m)^T \vec{\beta^{l'}} +
P^{l'} f_n \sum_{q=1}^{L-1} P^q J(m)^T \vec{\beta^q} -
P^{l'} \pder[f_n]{x_m}
\right] \\
&= f_n (\delta_{l,l'} - P^{l'}) \cdot P^l \left[J(m)^T\vec{\beta^l} -
\sum_{q=1}^{L-1} P^q J(m)^T \vec{\beta^q} \right] + \\
& \qquad P^l \left[\delta_{l,l'} \pder[f_n]{x_m} -
f_n \cdot P^{l'}
\left( J(m)^T \vec{\beta^{l'}} - \sum_{q=1}^{L-1} P^q J(m)^T \vec{\beta^q} \right)
- P^{l'} \pder[f_n]{x_m}
\right] \\[7pt]
&= f_n (\delta_{l, l'} - P^{l'})\mathit{ME}_m^l +
P^l \left[ \delta_{l, l'} \pder[f_n]{x_m} - f_n \mathit{ME}_m^{l'} - P^{l'} \pder[f_n]{x_m} \right].
\end{align*}
Again, the above computation is performed for every $l \in \{1 \ldots (L-1)\}$ (base outcome is skipped)
and every $m \in \{1 \ldots M\}$, with each computation returning a column vector
of size $(L-1) \times N$.
For categorical variables, we use the discrete difference value of $\mathit{ME}_m^l$
to compute the standard error as,
\begin{eqnarray*}
\pder[\mathit{ME}_m^l]{\beta_n^{l'}} &=&
\pder[P^{set_m, l}]{\beta_n^{l'}} - \pder[P^{unset_m, l}]{\beta_n^{l'}} \\[7pt]
&=& P^{set_m, l} f_n^{set_m} (\delta_{l,l'} - P^{set_m, l'}) -
P^{unset_m, l} f_n^{unset_m} (\delta_{l,l'} - P^{unset_m, l'}) \\[7pt]
&=& - \left(P^{set_m, l} f_n^{set_m} P^{set_m, l'} -
P^{unset_m, l} f_n^{unset_m} P^{unset_m, l'} \right) + \\
&& \qquad \delta_{l,l'} \left(P^{set_m, l} f_n^{set_m} - P^{unset_m, l} f_n^{unset_m} \right),
\end{eqnarray*}
where
\begin{align*}
& P^{set_m, l} = \frac{e^{\vec{f^{set_m}}\vec{\beta^l}}}{\displaystyle \sum_{q=1}^{L} e^{\vec{f^{set_m}}\vec{\beta^q}}} \\
& \text{and} \\
& P^{unset_m, l} = \frac{e^{\vec{f^{unset_m}}\vec{\beta^l}}}{\displaystyle \sum_{q=1}^{L} e^{\vec{f^{unset_m}}\vec{\beta^q}}} \qquad \qquad \forall l \in \{ 1 \ldots (L-1) \}.
\end{align*}
% subsection standard_errors (end)
% section marginal_effects (end)
\section{Clustered Standard Errors} % (fold)
\label{sec:clustered_standard_errors}
Adjusting standard errors for clustering can be important. For
example, replicating a dataset 100 times should not increase the
precision of parameter estimates, yet performing this procedure
under the i.i.d.\ assumption will actually do so. Another example arises in the
economics of education research: it is reasonable to expect that the
error terms for children in the same class are not
independent. Clustering the standard errors can correct for this.
\subsection{Overview of Clustered Standard Errors}
Assume that the data can be separated into $m$ clusters. Usually this
can be done by grouping the data table according to one or more
columns.
The estimator has a similar form to the usual sandwich estimator
\begin{equation}
S(\vec{\beta}) = B(\vec{\beta}) M(\vec{\beta}) B(\vec{\beta})
\end{equation}
The bread part is the same as in the usual sandwich estimator
\begin{eqnarray}
B(\vec{\beta}) & = & \left(-\sum_{i=1}^{n} H(y_i, \vec{x}_i,
\vec{\beta})\right)^{-1}\\
& = & \left(-\sum_{i=1}^{n}\frac{\partial^2 l(y_i, \vec{x}_i,
\vec{\beta})}{\partial \beta_\alpha \partial \beta_\beta}\right)^{-1}
\end{eqnarray}
where $H$ is the Hessian matrix, i.e., the second derivative of the
target function
\begin{equation}
L(\vec{\beta}) = \sum_{i=1}^n l(y_i, \vec{x}_i, \vec{\beta})\ .
\end{equation}
The meat part is different
\begin{equation}
M(\vec{\beta}) = {A}^T{A}
\end{equation}
where the $m$-th row of ${A}$ is
\begin{equation}
A_m = \sum_{i\in G_m}\frac{\partial
l(y_i,\vec{x}_i,\vec{\beta})}{\partial \vec{\beta}}
\end{equation}
and $G_m$ is the set of rows that belong to the $m$-th cluster.
\subsection{Implementation}
We can compute the quantities $B$ and $A$ for each cluster during one scan
through the data table in an aggregate function, and then sum over all clusters to
obtain the full $B$ and $A$ outside of the aggregate function. Finally, the
matrix multiplications are done in a separate function on the master.
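A small NumPy sketch of this computation is given below; it assumes the per-row
gradients and Hessians have already been evaluated, performs a single in-memory
pass instead of MADlib's aggregate-function implementation, and uses
illustrative names.
\begin{verbatim}
# Illustrative clustered sandwich: grads (n, K), hessians (n, K, K),
# clusters (n,) integer or string labels.
import numpy as np

def clustered_sandwich(grads, hessians, clusters):
    bread = np.linalg.inv(-hessians.sum(axis=0))             # B
    A = np.vstack([grads[clusters == g].sum(axis=0)          # A_m per cluster
                   for g in np.unique(clusters)])
    meat = A.T @ A                                            # M = A^T A
    return bread @ meat @ bread                               # S = B M B
\end{verbatim}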
% section clustered_standard_errors (end)