% When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
% !TEX root = ../design.tex
\chapter[Generalized Linear Models]{Generalized Linear Models}
\begin{moduleinfo}
\item[Author] \href{mailto:lpei@gopivotal.com}{Liquan Pei}
\item[Author] \href{mailto:lhuang@pivotal.io}{Lei Huang}
\item[History]
\begin{modulehistory}
\item[v0.1] Initial version
\item[v0.2] Extension to multivariate and ordinal response cases
\end{modulehistory}
\end{moduleinfo}
% Abstract. What is the problem we want to solve?
\section{Introduction}
The linear regression model assumes that the dependent variable $Y$ is the sum of a linear combination $\vec{X}^\top \vec{\beta}$ of the independent variables and a normally distributed error term
\begin{align*}
Y = \vec{X}^\top \vec{\beta} + \epsilon
\end{align*}
where $\vec{\beta} = (\beta_{1}, \dots, \beta_{m})^\top$ is a vector of unknown parameters and $\vec{X} = (X_1, \dots, X_{m})^\top$ is a vector of independent variables.
In a generalized linear model (GLM), the distribution of the dependent variable $Y$ is a member of the \emph{exponential family} and the mean $\mu = \mathbf{E}(Y)$ depends on the independent variables $\vec{X}$ through
\begin{align*}
\mu = \mathbf{E}(Y) = g^{-1}(\eta) =g^{-1}(\vec{X}^\top \vec{\beta})
\end{align*}
where $\eta = \vec{X}^\top \vec{\beta}$ is the \emph{linear predictor} and $g$ is the \emph{link function}.
In what follows, we denote $G(\eta) = g^{-1}(\eta)$ as the inverse link function.
\subsection{Exponential Family}
A random variable $Y$ is a member of the exponential family if its probability mass function or density function has the form
\begin{align*}
f(y,\theta, \psi) = \exp \left\{ \frac{y\theta - b(\theta)}{a(\psi)} + c(y, \psi)\right\}
\end{align*}
where $\theta$ is the canonical parameter and $\psi$ is the dispersion parameter.
The mean and variance of the exponential family are
\begin{itemize}
\item $\mathbf{E}(Y) = \mu = b'(\theta)$
\item $\mathbf{Var}(Y) = V(\mu)a(\psi) = b''(\theta)a(\psi)$
\end{itemize}
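For example, the Poisson distribution with mean $\mu$ belongs to this family, since its probability mass function can be written as
\begin{align*}
f(y; \mu) = \frac{\mu^{y} e^{-\mu}}{y!} = \exp\left\{ y\log\mu - \mu - \log y! \right\}
\end{align*}
so that $\theta = \log\mu$, $b(\theta) = e^{\theta}$, $a(\psi) = 1$ and $c(y, \psi) = -\log y!$. The identities above then give $\mathbf{E}(Y) = b'(\theta) = e^{\theta} = \mu$ and $\mathbf{Var}(Y) = b''(\theta)a(\psi) = \mu$, as expected.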
\subsection{Linear Predictor}
The linear predictor $\eta$ incorporates the information about the independent variables into the model. It is related to the expected value of the data through link functions.
\subsection{Link Function}
The link function provides the relationship between the linear predictor $\eta$ and the mean $\mu$ of the distribution. There are many commonly used link functions, and their choice can be somewhat arbitrary. It makes sense to match the domain of the link function to the range of the distribution's mean.
For the canonical parameter $\theta$, we write $\theta = \theta(\eta) = h(g(\mu))$ for some function $h$. If we choose $h$ to be the identity function, then $\theta = \eta$
and $\mu = G(\eta) = \mu(\theta)$; in this case $g$ is called the \emph{canonical link function}.
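For instance, the canonical links of the Gaussian, Poisson and binomial families are the identity, log and logit functions, respectively,
\begin{align*}
g(\mu) = \mu, \qquad g(\mu) = \log\mu, \qquad g(\mu) = \log\frac{\mu}{1-\mu}.
\end{align*}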
\section{Parameter Estimation}
We estimate the unknown parameters $\vec{\beta}$ by maximizing the log-likelihood of a GLM. Given examples $\vec{Y} = (Y_1, \dots, Y_n)^\top$ with means $\vec{\mu} = (\mu_1, \dots, \mu_{n})^\top$, the log-likelihood for a GLM is
\begin{align*}
l(\vec{Y}, \vec{\mu}, \psi) & =\sum_{i=1}^{n} \log f(Y_i, \theta_{i}, \psi) \\
& =\sum_{i=1}^{n}\left\{ \frac{Y_i \theta_{i} - b\left(\theta_{i}\right)}{a\left(\psi\right)} + c(Y_i, \psi)\right\}
\end{align*}
where $\theta_{i} = \theta(\eta_{i}) = \theta(\vec{x_i}^\top \vec{\beta})$ and $\vec{x_i}$ denotes the vector of values of the independent variables for $Y_i$.
Since $a(\psi)$ and $c(Y_i, \psi)$ do not depend on $\vec{\beta}$, it is equivalent to maximize
\begin{align}
\label{likelihood}
\tilde{l}(\vec{Y}, \vec{\mu}) = \sum_{i = 1}^{n} \left\{ Y_i \theta_{i} - b(\theta_{i}) \right\}
\end{align}
with respect to $\vec{\beta}$.
\subsection{Iterative Reweighted Least Squares Algorithm}
We use the iterative reweighted least squares (IRLS) algorithm to find the $\vec{\beta}$ that maximizes $\tilde{l}(\vec{Y},\vec{\mu})$. Specifically, we use the Fisher scoring algorithm, which updates $\vec{\beta}$ at step $k+1$ using
\begin{align*}
\vec{\beta}^{k+1} = \vec{\beta}^{k} - \left\{\mathbf{E}[H(\vec{\beta}^{k})]\right\}^{-1} \nabla_{\vec{\beta}}\tilde{l}(\vec{\beta}^{k})
\end{align*}
where $\mathbf{E}[H]$ is the expectation of the Hessian over the examples $\vec{Y}$ and $\nabla_{\vec{\beta}}\tilde{l}$ is the gradient.
For GLM, the gradient is
\begin{align*}
\nabla_{\vec{\beta}}\tilde{l} &= \sum_{i = 1}^{n}\left\{Y_i - b'(\theta_{i})\right\}\nabla_{\vec{\beta}}\theta_{i}
\end{align*}
Since $\mu_{i} = G(\eta_{i}) = G(\vec{x_i}^\top \vec{\beta}) = b'(\theta_{i})$ and $\mathrm{d}\mu_{i}/\mathrm{d}\theta_{i} = b''(\theta_{i}) = V(\mu_{i})$, the chain rule gives
\begin{align*}
\nabla_{\vec{\beta}}\theta_{i} &= \frac{G'(\eta_{i})}{V(\mu_{i})} \vec{x_i}
\end{align*}
%and
%\begin{align*}
%\frac{\partial^2 \theta_{i}}{\partial \beta \partial \beta^{T}} = \frac{G''(\eta_{i})V(\mu_{i}) - G'(\eta_{i})^2 V'(\mu_i)}{V(\mu_{i})^2}x_i x_i^T
%\end{align*}
then
\begin{align*}
\nabla_{\vec{\beta}}\tilde{l} &= \sum_{i = 1}^{n}\left\{Y_i - \mu_{i}\right\}\frac{G'(\eta_{i})}{V(\mu_{i})}\vec{x_i}
\end{align*}
The Hessian is
\begin{align*}
H(\vec{\beta}) &= \sum_{i = 1}^{n} \left\{ \left\{Y_i - b'(\theta_{i})\right\}\nabla_{\vec{\beta}}^2 \theta_{i} - b''(\theta_{i}) \nabla_{\vec{\beta}}\theta_{i} {\nabla_{\vec{\beta}}\theta_{i}}^\top \right\} \\
& = \sum_{i = 1}^{n} \left\{Y_i - \mu_{i}\right\}\nabla_{\vec{\beta}}^2 \theta_{i} - \sum_{i = 1}^{n} \frac{G'(\eta_{i})^2}{V(\mu_{i})} \vec{x_i} \vec{x_i}^\top
\end{align*}
where we used $b''(\theta_{i}) = V(\mu_{i})$. Since $\mathbf{E}[Y_i] = \mu_{i}$, the term involving $Y_i - \mu_{i}$ vanishes in expectation and
\begin{align*}
\mathbf{E}[H(\vec{\beta})] = -\sum_{i = 1}^{n} \left\{\frac{G'(\eta_{i})^2}{V(\mu_{i})}\right\}\vec{x_i} \vec{x_i}^\top
\end{align*}
Define the weight matrix
\begin{align*}
\vec{W} = \text{diag}\left( \frac{G'(\eta_{1})^2}{V(\mu_{1})}, \dots, \frac{G'(\eta_{n})^2}{V(\mu_{n})} \right)
\end{align*}
and define
\begin{align*}
\vec{\tilde{Y}} = \left(\frac{Y_1 - \mu_1}{G'(\eta_{1})}, \dots, \frac{Y_n - \mu_{n}}{G'(\eta_{n})}\right)^\top
\end{align*}
and the design matrix
\begin{align*}
\vec{X}^\top = \left(\vec{x_1}, \dots, \vec{x_n}\right)
\end{align*}
Finally, noting that $-\mathbf{E}[H(\vec{\beta})] = \vec{X}^\top \vec{W} \vec{X}$ and $\nabla_{\vec{\beta}}\tilde{l} = \vec{X}^\top \vec{W} \vec{\tilde{Y}}$, the update rule for GLM is
\begin{align*}
\vec{\beta}^{k+1} &= \vec{\beta}^{k} + (\vec{X}^\top \vec{W} \vec{X})^{-1}\vec{X}^\top \vec{W}\vec{\tilde{Y}} \\
&= (\vec{X}^\top \vec{W} \vec{X})^{-1}\vec{X}^\top \vec{W}\vec{Z}
\end{align*}
where $\vec{Z} = (Z_1, \dots, Z_n)^\top$ is a vector of \emph{adjusted dependent variables}
\begin{align*}
Z_i = \vec{x_i}^\top \vec{\beta}^{k} + \frac{Y_i - \mu_{i}}{G'(\eta_{i})}
\end{align*}
Note that each step is the result of a weighted least squares regression of the adjusted variables $Z_i$ on $\vec{x_i}$; this is the reason the algorithm is called iterative reweighted least squares.
The IRLS algorithm for GLM is as follows.
\begin{algorithm}
\alginput{$\vec{X}$, $\vec{Y}$, inverse link function $G(\eta)$, variance function $V(\mu)$ and initial values $\vec{\beta}^0$}
\algoutput{$\vec{\beta}$ that maximizes $\tilde{l}(\vec{Y}, \vec{\mu})$}
\begin{algorithmic}[1]
\State $k \leftarrow 0$
\Repeat
\State Compute $\vec{\mu}$ where $\mu_{i} = G(\eta_{i}) = G(\vec{x_i}^\top \vec{\beta}^{k})$
\State Compute $\vec{Z}$ where $Z_i = \vec{x_i}^\top \vec{\beta}^{k} + \frac{Y_i - \mu_{i}}{G'(\eta_{i})}$
\State Compute $\vec{W}$ where $W_{ii} = \frac{G'(\eta_{i})^2}{V(\mu_{i})}$
\State $\vec{\beta}^{k+1} = (\vec{X}^\top \vec{W} \vec{X})^{-1} \vec{X}^\top \vec{W}\vec{Z}$
\Until{$\vec{\beta}^{k+1}$ converges}
\end{algorithmic}
\label{alg:IRLS}
\end{algorithm}
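To make the iteration concrete, the following is a minimal sketch of the IRLS loop of Algorithm \ref{alg:IRLS}, assuming a Poisson GLM with the canonical log link, so that $G(\eta) = e^{\eta}$, $G'(\eta) = e^{\eta}$ and $V(\mu) = \mu$; the function and variable names are illustrative only and do not refer to the actual implementation.
\begin{verbatim}
import numpy as np

def irls(X, Y, beta0, tol=1e-8, max_iter=25):
    # X: (n, m) design matrix, Y: (n,) responses, beta0: (m,) start values
    beta = beta0.copy()
    for _ in range(max_iter):
        eta = X @ beta                 # linear predictors eta_i = x_i^T beta
        mu = np.exp(eta)               # mu_i = G(eta_i)
        Gp = np.exp(eta)               # G'(eta_i)
        V = mu                         # V(mu_i)
        W = Gp ** 2 / V                # diagonal of the weight matrix
        Z = eta + (Y - mu) / Gp        # adjusted dependent variables
        XtW = X.T * W                  # X^T W without forming diag(W)
        beta_new = np.linalg.solve(XtW @ X, XtW @ Z)
        if np.max(np.abs(beta_new - beta)) < tol:
            return beta_new
        beta = beta_new
    return beta
\end{verbatim}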
\subsection{Functions for constructing the exponential families}
Table \ref{tab:glm_func} \cite{fox2008applied} provides the functions $a()$, $b()$ and $c()$ that construct the exponential families.
\begin{table}[h]
\centering
\begin{tabular}{cccc}
Family & $a(\psi)$ & $b(\theta)$ & $c(y, \psi)$ \\
\hline
Gaussian & $\psi$ & $\theta^2/2$ & $-\frac{1}{2}\left[y^2/\psi+\log_e(2\pi\psi)\right]$ \\
Binomial & $1/n$ & $\log_e(1+e^\theta)$ & $\log_e C^n_{ny}$ \\
Poisson & 1 & $e^\theta$ & $-\log_e y!$ \\
Gamma & $\psi$ & $-\log_e(-\theta)$ & $\psi^{-1}\log_e(y/\psi) - \log_ey - \log_e\Gamma(\psi^{-1})$ \\
Inverse-Gaussian & $-\psi$ & $\sqrt{2\theta}$ & $-\frac{1}{2}\left[\log_e(2\pi\psi y^3) + 1/(\psi y) \right]$ \\
\end{tabular}
\caption{Functions for constructing the exponential families}
\label{tab:glm_func}
\end{table}
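As a check, substituting the Gaussian row into the general exponential-family form recovers the normal density with mean $\theta$ and variance $\psi$:
\begin{align*}
\exp\left\{ \frac{y\theta - \theta^2/2}{\psi} - \frac{y^2}{2\psi} - \frac{1}{2}\log_e(2\pi\psi) \right\}
= \frac{1}{\sqrt{2\pi\psi}} \exp\left\{ -\frac{(y-\theta)^2}{2\psi} \right\}
\end{align*}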
\subsection{Multivariate response family}
\label{subsec::multinomreg}
Instead of being a single scalar, some response variables follow a multivariate distribution; that is, the response is a vector rather than a scalar. One example is the multinomial GLM, where the response is an indicator vector of zeros and ones that dummy codes the corresponding categories. For illustration purposes, this section discusses the multinomial GLM; the derivation extends easily to other multivariate distributions.
Let $J$ denote the number of categories and $y_i=(y_{i1}, y_{i2}, ..., y_{i(J-1)})^T$ be the indicator vector for the $i$th subject, where $y_{ij}$ is the binary indicator of whether subject $i$ is in category $j$, and let $\mu_{ij}$ be the probability that subject $i$ is in category $j$. The log-likelihood is then
\begin{align*}
l & = \sum_{i=1}^{I} \left(\sum_{j=1}^{J-1} y_{ij} \log{\frac{\mu_{ij}}{1-\sum_{k=1}^{J-1}\mu_{ik}}} + \log\Big(1-\sum_{j=1}^{J-1}\mu_{ij}\Big) \right)\\
& = \sum_{i=1}^I \left( \sum_{j=1}^{J-1} y_{ij} \theta_{ij} - \log(1+\sum_{j=1}^{J-1}\exp\theta_{ij}) \right)
\end{align*}
Define $b(\theta_i) = \log(1+\sum_{j=1}^{J-1}\exp\theta_{ij})$; then it can be shown that $\triangledown b(\theta_i) = \mu_i$ and
\[
\triangledown \triangledown^T b(\theta_i) = \left( \begin{array}{cccc}
\mu_{i1}(1-\mu_{i1}) & -\mu_{i1}\mu_{i2} & ... & -\mu_{i1}\mu_{i(J-1)} \\
-\mu_{i2}\mu_{i1} & \mu_{i2}(1-\mu_{i2}) & ... & -\mu_{i2}\mu_{i(J-1)} \\
\vdots & \vdots & \vdots & \vdots \\
-\mu_{i(J-1)}\mu_{i1} & -\mu_{i(J-1)}\mu_{i2} & ... & \mu_{i(J-1)}(1-\mu_{i(J-1)})
\end{array} \right)
\]
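As a quick check of the gradient identity, differentiating $b$ with respect to $\theta_{ij}$ gives
\[
\frac{\partial b(\theta_i)}{\partial \theta_{ij}} = \frac{\exp\theta_{ij}}{1+\sum_{k=1}^{J-1}\exp\theta_{ik}} = \mu_{ij}
\]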
We set $V = \triangledown \triangledown^T b(\theta_i)$.
Let $\eta_{ij} = g_j(\mu_i)$ and let $g() = (g_1(), g_2(), ..., g_{J-1}())^T$ be the link map from $\Re^{J-1}$ to $\Re^{J-1}$, with inverse map $\mu_i = G(\eta_i)$. We define the derivative of $G$ to be $G^\prime = \left(\frac{\partial\mu_i}{\partial\eta_{i1}}, \frac{\partial\mu_i}{\partial\eta_{i2}}, ..., \frac{\partial\mu_i}{\partial\eta_{i(J-1)}} \right)$. For example, in multinomial logistic regression $\eta_{ij} = \theta_{ij}$, so $G^\prime = V$.
Denote the coefficients by $\beta_{kj}$, where $k$ indexes the predictor and $j$ the category. Then we have
\begin{align*}
\frac{\partial l_i}{\partial \beta_{kj}} & = (y_i - \triangledown b(\theta_i) )^T \frac{\partial \theta_i}{\partial \mu_i} \frac{\partial \mu_i}{\partial \eta_{ij}} \frac{\partial \eta_{ij}}{\partial \beta_{kj}} \\
& = (y_i - \mu_i )^T V^{-1} G^\prime_j x_{ik}
\end{align*}
where $G^\prime_j$ is the $j$th column of $G^\prime$.
Taking expectations of the second derivatives (the terms that multiply $y_i - \mu_i$ have zero expectation), we obtain
\begin{align*}
E\left[\frac{\partial^2 l_i}{\partial\beta_{kj} \partial\beta_{lh}}\right] &= -x_{il} (G^\prime_h)^T V^{-1} \triangledown \triangledown^T b(\theta_i) V^{-1} G^\prime_j x_{ik} \\
& = -x_{il} (G^\prime_h)^T V^{-1} G^\prime_j x_{ik}
\end{align*}
For the entire coefficient vector $\beta$,
\begin{align*}
\triangledown_\beta l_i &= \left((y_i - \mu_i)^T V^{-1} G^\prime(\mu_i) \right)^T \otimes X_i \\
E\left[\triangledown \triangledown^T_\beta l_i\right] & = - \left( [G^\prime(\mu_i)]^T V^{-1} [G^\prime(\mu_i)] \right) \otimes (X_i X_i^T)
\end{align*}
where $X_i = (x_{i1}, x_{i2}, ... , x_{ip})^T$.
Finally, we can use the update equation below for the Newton-Raphson (Fisher scoring) method,
\[
\beta^{k+1} = \beta^k + \left\{ \sum_{i=1}^I\left([G^\prime(\mu_i)]^T V^{-1} [G^\prime(\mu_i)] \right) \otimes (X_i X_i^T)\right\}^{-1}\sum_{i=1}^I\left\{\left((y_i - \mu_i)^T V^{-1} G^\prime(\mu_i) \right)^T \otimes X_i\right\}
\]
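Below is a minimal sketch of this update for the multinomial logistic case ($\eta_{ij} = \theta_{ij}$, so $G^\prime = V$ and the Kronecker factors simplify); it is meant only to illustrate the block structure of the equation above, and all names are illustrative.
\begin{verbatim}
import numpy as np

def multinomial_logistic_step(X, Y, beta, J):
    # X: (n, p) design matrix, Y: (n, J-1) indicator matrix,
    # beta: ((J-1)*p,) coefficient vector stored category by category.
    n, p = X.shape
    B = beta.reshape(J - 1, p)               # row j holds the coefficients of category j
    info = np.zeros(((J - 1) * p, (J - 1) * p))
    score = np.zeros((J - 1) * p)
    for i in range(n):
        eta = B @ X[i]                       # (J-1,) linear predictors
        e = np.exp(eta)
        mu = e / (1.0 + e.sum())             # category probabilities
        V = np.diag(mu) - np.outer(mu, mu)   # grad grad^T b(theta_i)
        info += np.kron(V, np.outer(X[i], X[i]))
        score += np.kron(Y[i] - mu, X[i])
    return beta + np.linalg.solve(info, score)
\end{verbatim}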
\subsection{Ordinal logistic regression}
In statistics, the ordered logit model (also called ordered logistic regression or the proportional odds model) is a regression model for ordinal dependent variables. For example, if a survey question is to be answered by a choice among ``poor'', ``fair'', ``good'', ``very good'', and ``excellent'', and the purpose of the analysis is to see how well that response can be predicted from the responses to other questions, some of which may be quantitative, then ordered logistic regression may be used. It can be thought of as an extension of the logistic regression model for dichotomous dependent variables, allowing for more than two (ordered) response categories.
The model we implement here applies only to data that meet the proportional odds assumption, which can be stated as follows.
\[
\log\left( \frac{\Pr(Y_i \le j)}{1-\Pr(Y_i \le j)} \right) = \alpha_j - \beta_1 x_{i1} - \beta_2 x_{i2} - ... - \beta_p x_{ip}
\]
where $\Pr(Y_i \le j)$ is the cumulative probability that the $i$th subject belongs to one of the first $j$ categories, $\alpha_j$ is the category-specific intercept, and $\beta_k$ is the feature-specific coefficient. Using the notation in subsection \ref{subsec::multinomreg}, the link function is
\[
g\left([\mu_1, \mu_2, ..., \mu_{J-1}]^T\right) = \left[\log\left(\frac{\mu_1}{1-\mu_1}\right), \log\left(\frac{\mu_1+\mu_2}{1-\mu_1-\mu_2}\right), ..., \log\left(\frac{\mu_1+\mu_2+...+\mu_{J-1}}{1-\mu_1-\mu_2-...-\mu_{J-1}}\right)\right]^T
\]
Then the inverse link function $G$ is
\begin{align*}
G\left([\eta_1, \eta_2, ..., \eta_{J-1}]^T\right) & = \left[\frac{\exp(\eta_1)}{1+\exp(\eta_1)}, \frac{\exp(\eta_2)}{1+\exp(\eta_2)} - \frac{\exp(\eta_1)}{1+\exp(\eta_1)},\right. \\
& \left. ..., \frac{\exp(\eta_{J-1})}{1+\exp(\eta_{J-1})} - \frac{\exp(\eta_{J-2})}{1+\exp(\eta_{J-2})}\right]^T
\end{align*}
Its derivative matrix $G^\prime$ is
\begin{align*}
G^\prime & = \left[\frac{\partial \mu}{\partial \eta_1}, \frac{\partial \mu}{\partial \eta_2}, ..., \frac{\partial \mu}{\partial \eta_{J-1}}\right] \\
& = \left[ \begin{array}{ccccc}
\frac{\exp(\eta_1)}{(1+\exp(\eta_1))^2} & 0 & 0 & ... & 0 \\
-\frac{\exp(\eta_1)}{(1+\exp(\eta_1))^2} & \frac{\exp(\eta_2)}{(1+\exp(\eta_2))^2} & 0 & ... & 0 \\
0 & -\frac{\exp(\eta_2)}{(1+\exp(\eta_2))^2} & \frac{\exp(\eta_3)}{(1+\exp(\eta_3))^2} & ... & 0 \\
\vdots & \vdots & \vdots & ... & \vdots \\
0 & 0 & 0 & ... & \frac{\exp(\eta_{J-1})}{(1+\exp(\eta_{J-1}))^2}
\end{array} \right]
\end{align*}
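A minimal sketch of the inverse link $G$ and its derivative matrix $G^\prime$ for the ordinal logit model, assuming the argument is the vector $(\eta_{i1}, ..., \eta_{i(J-1)})$ of a single subject (names are illustrative only):
\begin{verbatim}
import numpy as np

def ordinal_logit_G(eta):
    cdf = np.exp(eta) / (1.0 + np.exp(eta))        # cumulative probabilities
    mu = np.diff(np.concatenate(([0.0], cdf)))     # category probabilities
    pdf = np.exp(eta) / (1.0 + np.exp(eta)) ** 2   # d cdf_j / d eta_j
    Gp = np.diag(pdf) - np.diag(pdf[:-1], k=-1)    # derivative matrix G'
    return mu, Gp
\end{verbatim}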
Define $h_i = \sum_{j=1}^{J-1} \frac{\partial \mu_i}{\partial \eta_{ij}}$; then, for the entire coefficient vector $\gamma = [\alpha_1, \alpha_2, ..., \alpha_{J-1}, \beta_1, \beta_2, ..., \beta_p]^T$,
\begin{align*}
\triangledown_\gamma l_i &= \left[ \begin{array}{c}
\left\{(y_i - \mu_i)^T V^{-1} G_i^{\prime} \right\}^T \\
-(y_i - \mu_i)^T V^{-1} h_i X_i
\end{array} \right] \\
E\left[\triangledown \triangledown^T_\gamma l_i\right] & = - \left[ \begin{array}{cc}
(G_i^\prime)^T V^{-1} G_i^\prime & -(G_i^\prime)^T V^{-1} h_i X_i^T \\
-X_i h_i^T V^{-1} G_i^\prime & h_i^T V^{-1} h_i X_iX_i^T
\end{array}
\right]
\end{align*}
where $X_i = (x_{i1}, x_{i2}, ... , x_{ip})^T$ and $G_i^\prime = G^\prime(\eta_{i1},\eta_{i2},...,\eta_{i(J-1)})$. We can then implement the Newton-Raphson method using the above gradient and Hessian matrix.
\subsection{Ordinal probit model}
In the ordinal probit model, the probit link, based on the standard normal cumulative distribution function $\Phi(x)$, is used instead of the logistic link of ordinal logistic regression. The model becomes
\[
\Phi^{-1}\left( \Pr(Y_i \le j) \right) = \alpha_j - \beta_1 x_{i1} - \beta_2 x_{i2} - ... - \beta_p x_{ip}
\]
and the link function is
\[
g\left([\mu_1, \mu_2, ..., \mu_{J-1}]^T\right) = \left[\Phi^{-1}(\mu_1), \Phi^{-1}(\mu_1+\mu_2), ..., \Phi^{-1}(\mu_1+\mu_2+...+\mu_{J-1})\right]^T
\]
Then the inverse link function $G$ is
\[
G\left([\eta_1, \eta_2, ..., \eta_{J-1}]^T\right) = \left[\Phi(\eta_1), \Phi(\eta_2) - \Phi(\eta_1), ..., \Phi(\eta_{J-1}) - \Phi(\eta_{J-2})\right]^T
\]
Its derivative matrix $G^\prime$ is
\begin{align*}
G^\prime & = \left[\frac{\partial \mu}{\partial \eta_1}, \frac{\partial \mu}{\partial \eta_2}, ..., \frac{\partial \mu}{\partial \eta_{J-1}}\right] \\
& = \left[ \begin{array}{ccccc}
\phi(\eta_1) & 0 & 0 & ... & 0 \\
-\phi(\eta_1) & \phi(\eta_2) & 0 & ... & 0 \\
0 & -\phi(\eta_2) & \phi(\eta_3) & ... & 0 \\
\vdots & \vdots & \vdots & ... & \vdots \\
0 & 0 & 0 & ... & \phi(\eta_{J-1})
\end{array} \right]
\end{align*}
where $\Phi(x)$ and $\phi(x)$ denote the cumulative distribution function and the probability density function of the standard normal distribution, respectively.
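A sketch analogous to the ordinal logit one above, using the standard normal distribution functions from scipy.stats for $\Phi$ and $\phi$ (again, all names are illustrative only):
\begin{verbatim}
import numpy as np
from scipy.stats import norm

def ordinal_probit_G(eta):
    cdf = norm.cdf(eta)                          # Phi(eta_j)
    mu = np.diff(np.concatenate(([0.0], cdf)))   # category probabilities
    pdf = norm.pdf(eta)                          # phi(eta_j)
    Gp = np.diag(pdf) - np.diag(pdf[:-1], k=-1)  # derivative matrix G'
    return mu, Gp
\end{verbatim}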