% When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
% !TEX root = ../design.tex
\chapter[Cox Proportional-Hazards]{Cox Proportional-Hazards}
\section{Introduction}
% Abstract. What is the problem we want to solve?
Proportional-hazards models enable comparison of \textit{survival models}.
Survival models are functions describing the probability of a one-time event
(prototypically, this event is death) with respect to time. The interval of
time before death occurs is the \textit{survival time}. Let $T$ be a random
variable representing the survival time, with a cumulative distribution function
$P(t)$. Informally, $P(t)$ represents the probability that death has happened
before time $t$.
An equivalent formulation is the \textit{survival function} $S(t)$, defined as
$S(t) \equiv 1 - P(t)$. Informally, this is the probability that death has not
happened by time $t$. The \textit{hazard function} $h(t)$ assesses the
instantaneous risk of death at time $t$, conditional on survival up to time $t$:
\begin{align}
h(t) &= \lim_{\Delta t \rightarrow 0} \frac{\Pr\left(t \le T < t + \Delta t \mid T \ge t \right)}
{\Delta t} \\
&= \lim_{\Delta t \rightarrow 0} \frac{1}{\Delta t}\frac{\Pr(T < t + \Delta t) - \Pr(T < t)}{\Pr(T \ge t)}
\end{align}
The relationship between $h(t)$ and $S(t)$, using $S(t) = 1 - \Pr(T < t)$, is
\begin{align}
h(t) & = \lim_{\Delta t \rightarrow 0} \frac{1}{\Delta t} \frac{\left(S(t + \Delta t) - S(t)\right)}{-S(t)}\\
& = \frac{-S'(t)}{S(t)}
\end{align}
where
$$S'(t) = \lim_{\Delta t \rightarrow 0} \frac{S(t + \Delta t) - S(t)}{\Delta t}$$
denotes the derivative of $S(t)$.
In the simplest case, the Cox model assumes that $h(t)$ is
\begin{equation}
h(t) = e^{\alpha(t)}
\end{equation}
where $e^{\alpha(t)}$ is the \textit{baseline hazard function}, which depends only on
$t$. However, in many applications, the probability of death may depend on more
than just $t$. Other covariates, such as age or weight, may be important. Let
$x_i$ denote the observed value of the $i$th covariate; the Cox model is then
written as
\begin{equation}
h(t) = e^{\alpha(t)} e^{\beta_1 x_1 + \beta_2 x_2 + \dots + \beta_m x_m} = e^{\alpha(t)} e^{\mathbf{\beta^T x}}
\end{equation}
where $\beta_i$ is the coefficient associated with the $i$th covariate.
Many applications take values from multiple observations, measuring the values
of $x_i$ for each observation. %The $j$th observation has the hazard function
%\begin{equation}
%h_j(t) = e^{\alpha(t)} e^{\beta_1 x_{j1} + \beta_2 x_{j2}+ \dots + \beta_k x_{jk}}= e^{\alpha(t)} e^{\mathbf{\beta^T x_j}}.
%\end{equation}
In the \textit{proportional-hazard model}, the hazard functions of two
observations $j$ and $k$ are compared. The ratio of the two is
\begin{equation}
\frac{h_j(t)}{h_k(t)} = \frac{e^{\alpha(t)} e^{\mathbf{\beta^T x_j}} }{e^{\alpha(t)} e^{\mathbf{\beta^T x_k}} } = \frac{e^{\mathbf{\beta^T x_j} }}{ e^{\mathbf{\beta^T x_k}}}
\end{equation}
The critical idea here is that the ratio of the two hazard functions is
completely independent of the baseline function. This allows meaningful
comparisons between samples without knowing the baseline function, which may be
difficult to measure or specify.
% SECTION: Applications
\section{Applications}\label{cox:Application}
Generally, applications start with a list of $n$ observations, each with $m$
covariates and a time of death. From this $n \times (m+1)$ matrix, we would
like to derive the correlation between the covariates and the hazard function.
This amounts to finding the values of $\beta$.
The values of $\beta$ can be estimated with the method of \textit{partial
likelihood}. This method begins by sorting the observations by time of death
into a list $[t_1, t_2, \dots, t_n]$ such that $t_i \le t_j$ whenever $i < j$.
For any time $t_i$, let $R(t_i)$ be the set of observations still alive
at time $t_i$.
Given that there was a death at time $t_i$ and observation $k$ was alive prior
to $t_i$, the probability that the death happened to observation $k$ is
\begin{equation}\label{cox:equ:death-prob}
\Pr(T_k = t_i | R(t_i)) = \frac{e^{\mathbf{\beta^T x_k} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}}.
\end{equation}
The \textit{partial likelihood function} can now be generated as the product of conditional probabilities.
More formally,
\begin{equation}\label{cox:equ:likelihood}
\mathcal{L} = \prod_{i = 1}^n \left( \frac{e^{\mathbf{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}} \right).
\end{equation}
The log-likelihood form of this equation is
\begin{equation}\label{cox:equ:LLH}
L = \sum_{i = 1}^n \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right].
\end{equation}
An estimate of $\beta$ can be found by maximizing this log-likelihood.
To maximize the likelihood, it helps to have the derivative of equation
\ref{cox:equ:LLH}, which is
\begin{equation}\label{cox:equ:LLH derivative}
\frac{\partial L}{\partial \beta_k} = \sum_{i = 1}^n \left( x_{ik} - \frac{\sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } } \right).
\end{equation}
It follows that the second derivative is
\begin{equation}\label{cox:equ:LLH second derivative}
\frac{\partial^2 L}{\partial \beta_k \partial \beta_l} = \sum_{i = 1}^n
\left(
\frac{\left( \sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x_j} } \right)
\left( \sum_{j \in R(t_i)} x_{jl} e^{\mathbf{\beta^T x_j} } \right)}
{\left( \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right)^2 } -
\frac{ \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\mathbf{\beta^T x_j} } }
{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } }
\right).
\end{equation}
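To make these formulas concrete, the following NumPy sketch (the helper name
\texttt{cox\_loglik\_grad\_hess} is ours, not part of MADlib, whose implementation
is in SQL and C++) evaluates Equations \ref{cox:equ:LLH},
\ref{cox:equ:LLH derivative} and \ref{cox:equ:LLH second derivative} naively in
$O(n^2)$ time. It assumes the rows of \texttt{X} are sorted by time of death,
with no censoring and no ties.
\begin{verbatim}
import numpy as np

def cox_loglik_grad_hess(X, beta):
    """Naive evaluation of the partial log-likelihood, its gradient and its
    Hessian. Rows of X are assumed sorted by time of death, so the risk set
    R(t_i) is simply rows i..n-1."""
    n, m = X.shape
    eta = X @ beta                      # beta^T x_j for every observation
    w = np.exp(eta)
    L = 0.0
    grad = np.zeros(m)
    hess = np.zeros((m, m))
    for i in range(n):
        Xi, wi = X[i:], w[i:]           # rows in the risk set R(t_i)
        S = wi.sum()                    # sum_j exp(beta^T x_j)
        H = Xi.T @ wi                   # sum_j x_j exp(beta^T x_j)
        V = Xi.T @ (Xi * wi[:, None])   # sum_j x_j x_j^T exp(beta^T x_j)
        L += eta[i] - np.log(S)
        grad += X[i] - H / S
        hess += np.outer(H, H) / S**2 - V / S
    return L, grad, hess
\end{verbatim}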
% Incomplete Data
\subsection{Incomplete Data}
Frequently, not every observation will have an associated time of death.
Typically, this arises when the period of observation terminates before the
entire population being studied has died. This is known as \textit{censoring}
the data. To account for this, an additional indicator variable $\delta_i$ is
introduced, which is set to 1 if the $i$th observation has an associated time of
death and 0 otherwise.
Incorporating this indicator variable into equation \ref{cox:equ:likelihood} gives
\begin{equation}\label{cox:equ:likelihood-censoring}
\mathcal{L} = \prod_{i = 1}^n \left( \frac{e^{\mathbf{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}} \right)^{\delta_i}.
\end{equation}
The corresponding changes to the log-likelihood function and its derivatives are straightforward.
% SECTION: Implementation
\subsection{Partitioning and Aggregation of the Data}
To speed up the computation, we first partition the data and
aggregate each partition into one big row. During the
computation, an entire big row is loaded into memory for
processing, which speeds up the computation.

When we partition the data, we want to (1) preserve the descending
order of the time column, (2) make sure that each partition contains
approximately the same amount of data so that the work load is even,
and (3) do this as fast as possible.

Our solution is to first sample a portion of the data and then
compute approximate break points from the sample. The sample should be
small enough to load into memory, and also large enough that the break
points can be computed relatively accurately.

After we have partitioned the data, we aggregate each partition into
one big row. The order of the data must be preserved during the
aggregation.

We then use the sequential algorithm described in the next subsection to process
the new data row by row in the order of the time column. Since each
big row contains many of the original rows, and we process them all in
memory, this speeds up the computation.
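To make the sampling step concrete, here is a small NumPy sketch (the helper name
\texttt{approximate\_break\_points} and its parameters are ours for illustration)
that draws a sample of the time column and uses its quantiles as approximate
partition boundaries, so each partition receives roughly the same number of rows.
\begin{verbatim}
import numpy as np

def approximate_break_points(time_column, num_partitions,
                             sample_size=10_000, seed=None):
    """Draw a manageable sample of the time column and use its quantiles as
    approximate break points for partitioning."""
    rng = np.random.default_rng(seed)
    sample = rng.choice(time_column,
                        size=min(sample_size, len(time_column)),
                        replace=False)
    # Interior quantiles of the sample serve as partition boundaries.
    qs = np.linspace(0.0, 1.0, num_partitions + 1)[1:-1]
    return np.quantile(sample, qs)

# Example: split survival times into 4 partitions of roughly equal size.
times = np.random.default_rng(0).exponential(scale=5.0, size=100_000)
print(approximate_break_points(times, num_partitions=4))
\end{verbatim}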
\subsection{Implementation of Newton's Method}
Newton's method is the most common choice for estimating $\beta$ by maximizing
the log-likelihood in Equation \ref{cox:equ:LLH}, using the following update rule:
\begin{equation}
\beta^{(k+1)} = \beta^{(k)} - \alpha_k \left( {\nabla^2 L}^{-1} \nabla L \right),
\end{equation}
where $\alpha_k$ is a positive scalar denoting the step length in the Newton
direction ${\nabla^2 L}^{-1} \nabla L$, determined using the first and second
derivative information. We would like to emphasize that the problems we are
designing this system for are those with many records and few features, i.e.\ $n
\gg m$, thereby keeping the inverse operation on the Hessian matrix relatively
cheap.
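For illustration, here is a minimal sketch of the resulting Newton iteration.
The callable \texttt{loglik\_grad\_hess} is hypothetical (for example, the helper
sketched in Section \ref{cox:Application}); a full step ($\alpha_k = 1$) is taken,
and the stopping rule is the relative change in the log-likelihood.
\begin{verbatim}
import numpy as np

def newton_optimize(loglik_grad_hess, beta0, tol=1e-8, max_iter=100):
    """Newton iteration for the Cox partial log-likelihood.
    loglik_grad_hess(beta) is assumed to return (L, grad, hess).
    Solving against the Hessian is cheap because the number of
    features m is assumed to be small."""
    beta = np.asarray(beta0, dtype=float)
    L_old, grad, hess = loglik_grad_hess(beta)
    for _ in range(max_iter):
        beta = beta - np.linalg.solve(hess, grad)   # full Newton step
        L_new, grad, hess = loglik_grad_hess(beta)
        if abs(1.0 - L_new / L_old) < tol:          # relative-change stopping rule
            break
        L_old = L_new
    return beta
\end{verbatim}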
The gradient and Hessian may be hard to parallelize, which reduces the
advantage of having a large number of observations. To elaborate, consider equations
\ref{cox:equ:LLH derivative} and \ref{cox:equ:LLH second derivative}, which are
sums of independent terms. One might think it natural to reduce the
computational burden by parallelization: each term could be computed independently
in parallel by a set of worker tasks, and a master task could collect the outputs
from the workers and sum them together. However, this might not work well in this
setting. To see why,
consider parallelizing equation \ref{cox:equ:LLH second derivative}. Each
worker task is given one term in the sum, which looks like
\begin{equation}
\frac{\left( \sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x _j} } \right) \left( \sum_{j \in R(t_i)} x_{jl} e^{\mathbf{\beta^T x _j} } \right)}{\left( \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right)^2 } - \frac{ \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\mathbf{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } }.
\end{equation}
Note that the sums in this term range over the risk set $R(t_i)$, which for
early times contains nearly all of the data points in the data matrix. A
similar issue is encountered while computing the first
derivative terms defined in \ref{cox:equ:LLH derivative}. However, we note
that this sum has a structure that allows it to be computed in linear time (with
respect to the number of data points) using the following quantities:
\begin{align}
H_{i} &= \sum_{j \in R(t_i)} x_{j} e^{\mathbf{\beta^T x_j}}\\
S_{i}&= \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}} \\
V_{i}&= \sum_{j \in R(t_i)} x_{j}x_{j}^T e^{\mathbf{\beta^T x_j} }
\end{align}
Note that $H_{i}$ is a column vector with $m$ elements ($H_{i}\in
\mathbb{R}^m$), $S_{i}$ is a scalar, and $V_{i}$ is an $m \times m$ matrix.
We can now write the gradient of the log-likelihood,
defined in Equation \ref{cox:equ:LLH derivative}, as
\begin{align}
\nabla L = \sum_{i = 1}^n \left( x_{i} - \frac{H_{i} }{ S_{i}} \right),
\end{align}
while the Hessian, defined in Equation \ref{cox:equ:LLH second
derivative}, can be reduced to
\begin{align}
\nabla^2 L = \sum_{i = 1}^n \left( \frac{H_{i}H_{i}^T }{ S_{i}^2 } - \frac{V_{i}}{ S_{i} } \right).
\end{align}
Since we assume that the data points are sorted in increasing order, i.e.\
$R(t_i) = \{i, i+1 \ldots n \}$, we can calculate the above quantities with the recurrences
\begin{align}
H_{i}& = H_{i+1} + x_{i} e^{\mathbf{\beta^T x_i}} \label{eq:numerator_avg}\\
S_i & = S_{i+1} + e^{\mathbf{\beta^T x_i}} \label{eq:denominator_avg}\\
V_i & = V_{i+1} + x_{i} x_{i}^T e^{\mathbf{\beta^T x_i}} \label{eq:hessian}.
\end{align}
With these recurrence relationships, the Hessian matrix and the gradient
can be computed in linear time.
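For illustration, the NumPy sketch below (the helper name
\texttt{cox\_grad\_hess\_linear} is ours) accumulates $H_i$, $S_i$ and $V_i$ in a
single backward pass over the sorted data and returns the gradient and Hessian;
censoring is omitted, as in the equations above.
\begin{verbatim}
import numpy as np

def cox_grad_hess_linear(X, beta):
    """Linear-time gradient and Hessian via the recurrences for H_i, S_i, V_i.
    X is assumed sorted so that R(t_i) = {i, i+1, ..., n}; one backward pass
    accumulates the risk-set sums, so the cost is O(n m^2) instead of O(n^2 m^2)."""
    n, m = X.shape
    w = np.exp(X @ beta)
    grad = np.zeros(m)
    hess = np.zeros((m, m))
    H = np.zeros(m)           # H_{n+1} = 0
    S = 0.0                   # S_{n+1} = 0
    V = np.zeros((m, m))      # V_{n+1} = 0
    for i in range(n - 1, -1, -1):        # from the last observation backwards
        H += w[i] * X[i]
        S += w[i]
        V += w[i] * np.outer(X[i], X[i])
        grad += X[i] - H / S
        hess += np.outer(H, H) / S**2 - V / S
    return grad, hess
\end{verbatim}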
%In addition to the computational expense of computing the Hessian, the inverse
%Hessian must be computed as well. Unfortunately, matrix inversion is an
%expensive operation, and cannot be parallelized.
% SECTION: Stratification Support
\section{Stratification Support}\label{cox:stratified}
A crucial component of the Cox Proportional Hazards model is the proportional hazards assumption:
The hazard for a given individual is a fixed proportion of the hazard for any
other individual in the same stratum, and the ratio of the hazards is constant
across time.
In actual use cases, the proportional-hazards assumption may not be satisfied if
we use all independent variables as covariates. A stratified Cox regression
model may then be useful. It offers a way to choose a subset of
independent variables as covariates while still taking the remaining
independent variables into account. The stratified Cox regression is available
in both R~\cite{rcox} and Stata~\cite{statacox}.
Stratification is used as shorthand for building a Cox model that allows for
more than one stratum, and hence, allows for more than one baseline hazard
function. Stratification provides two pieces of key, flexible functionality for
the end user of Cox models:
\begin{enumerate}
\item Allows a categorical variable Z to be appropriately accounted for in
the model without estimating its predictive/associated impact on the
response variable (i.e. without estimating Z's ``coefficient'').
\item Accounts for a categorical variable Z that is predictive of/associated with the response
variable but may not satisfy the proportional hazards assumption.
\end{enumerate}
To explicitly clarify how stratification differentiates from grouping support:
\begin{itemize}
\item Grouping by a categorical column would build a completely separate
Cox model for each value of the categorical column, where the baseline
hazards for each value of the categorical column would be different and
the estimated coefficient values for each explanatory variable would be
different for each value of the categorical column.
\item Stratifying by a categorical column would build a single
common Cox model, where the baseline hazards for each value of the
categorical column would be different, but the estimated coefficient values
for each explanatory variable would be the same for each value of the stratum.
\end{itemize}
It is valuable to emphasize that all strata share all coefficients, and that the
only difference between strata is the baseline hazard. In other words,
coefficients for all non-strata explanatory variables are identical across the
different strata.
\subsection{Estimating A Stratified Cox Model}\label{cox:estimate-stratified}
The parameter estimation is done by maximizing the product of likelihoods, each
from a stratum~\cite{stratifiedethzslides}.
Given $n$ observations, each with $m$ covariates and each belonging to one of $K$
strata\footnote{Note that this does not mean that we have $K$ variables other
than the $m$ covariates, but rather $K$ groups classified by the strata ID variables.}, let
$\mathit{ST}_{k}$ denote the set of observations in the $k$-th stratum.
Because maximizing the sum of log-likelihoods is equivalent to maximizing the
product of likelihoods, we work with log-likelihoods; according to Equation
\ref{cox:equ:LLH}, the log-likelihood associated with the $k$-th stratum is
\begin{equation}\label{cox:equ:obj-each-stratum}L_{k} = \sum_{i \in \mathit{ST}_{k}} \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right],
\end{equation}
where $R_k(t_i)$ is the set of observations in $\mathit{ST}_{k}$ that are still alive at time $t_i$.
Therefore, the objective function of stratified Cox regression can be expressed as
\begin{equation}\label{cox:equ:stratified-obj}
L^{\mathit{stratified}} = \sum_{k=1}^{K} L_{k} = \sum_{k=1}^{K} \sum_{i \in \mathit{ST}_{k}} \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right].
\end{equation}
The corresponding changes to the gradient, the Hessian, and the handling of censoring are straightforward.
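As a small illustration of Equation \ref{cox:equ:stratified-obj}, the NumPy sketch
below (the helper name \texttt{stratified\_cox\_loglik} is ours) simply sums the
ordinary partial log-likelihoods computed within each stratum; it assumes the rows
of each stratum are sorted by time of death and ignores censoring.
\begin{verbatim}
import numpy as np

def stratified_cox_loglik(X, strata, beta):
    """Stratified objective: the sum over strata of the per-stratum partial
    log-likelihoods. Within a stratum, R_k(t_i) consists of the remaining
    rows of that stratum (rows assumed sorted by time of death)."""
    total = 0.0
    for k in np.unique(strata):
        Xk = X[strata == k]
        eta = Xk @ beta
        w = np.exp(eta)
        # risk set at the i-th death of stratum k = rows i.. of this stratum
        total += sum(eta[i] - np.log(w[i:].sum()) for i in range(len(Xk)))
    return total
\end{verbatim}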
\subsection{When Do We Need A Stratified Cox Model?}\label{cox:diagnostic}
General practice in standard statistical packages (e.g.\ R, SAS) is to
make use of the Schoenfeld residuals to gauge whether or not the
proportional-hazards assumption has been violated.
% To this end, we need to investigate the implementation of R's
% \texttt{cox.zph()} function for MADlib.
% There are several parametric options available in the
% \texttt{cox.zph()} function, but we prioritize implementation of
% \texttt{cox.zph()} with its default parameter settings.
% The output of \texttt{cox.zph()} contains 3 numbers for each
% covariate, scaled Schoenfeld residual, chi-square, and p-value.
% The actual formulas of computing these are still to be investigated.
The Schoenfeld residuals are centered on zero and should be
independent of time if PHA (proportional hazard assumption) is
true. Deviations from this, i.e. residuals that exhibit some trend in
time, indicate that the PHA is violated.
The Schoenfeld residual at a time when a failure or death occurs is
defined as the difference between the observed and expected covariate values at
that time. Also note that the Schoenfeld residuals are zero for censored
observations:
\begin{equation}
\hat{\vec{r}}_i = \vec{x}_i - E[\vec{x}_i]\ ,\mbox{ only for
}\delta_i=1\ ,
\end{equation}
To compute the expected values at time $t_{i}$, we use the probability distribution given by Equation \ref{cox:equ:death-prob}.
\begin{eqnarray}
E[\vec{x}_i] &=& \sum_{k\in \mathcal{R}(t_i)}\vec{x}_k p(k\mbox{ dies at
}t_i) \nonumber\\
&=& \frac{\sum_{k\in \mathcal{R}(t_i)}\vec{x}_k
e^{\vec{\beta}^T\vec{x}_k}}{\sum_{j\in
\mathcal{R}(t_i)}e^{\vec{\beta}^T\vec{x}_j} },
\end{eqnarray}
where the values of $\vec{\beta}$ are the fitted coefficients, and $\mathcal{R}(t)$ is the set of individuals that are still alive at time $t$.
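For illustration, the NumPy sketch below (the helper name
\texttt{schoenfeld\_residuals} is ours) computes the unscaled residuals for the
uncensored observations, assuming the rows are sorted by survival time so that
the risk set at $t_i$ is rows $i$ onward.
\begin{verbatim}
import numpy as np

def schoenfeld_residuals(X, delta, beta):
    """Unscaled Schoenfeld residuals r_i = x_i - E[x_i], defined only for
    uncensored observations (delta_i = 1). Rows of X are assumed sorted by
    survival time."""
    n, m = X.shape
    w = np.exp(X @ beta)
    residuals = []
    for i in range(n):
        if delta[i] == 0:                  # censored: no residual
            continue
        Xi, wi = X[i:], w[i:]              # risk set at time t_i
        expected = (Xi.T @ wi) / wi.sum()  # E[x_i] under Eq. (death-prob)
        residuals.append(X[i] - expected)
    return np.array(residuals)             # one length-m row per death
\end{verbatim}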
\subsubsection{Scaled Schoenfeld Residuals}\label{cox:scaled-residual}
Suggested by Grambsch and Therneau \cite{grambsch1994proportional} and also
followed by statistical software packages \cite{testph}, scaling the Schoenfeld
residuals by an estimate of their variance yields greater diagnostic power. The
scaled Schoenfeld residuals are
\begin{eqnarray}
\hat{\vec{r}}^{*}_i &=& [\mathit{Var}(\hat{\vec{r}}_i)]^{-1} \hat{\vec{r}}_i\\
&\approx& m \mathit{Var}(\hat{\vec{\beta}}) \hat{\vec{r}}_i ,
\end{eqnarray}
where $\mathit{Var}$ denotes a covariance matrix, $m$ is the
number of uncensored survival times, and $n$ is the number of features;
$\hat{\vec{r}}_i$ is a length-$n$ vector and $\mathit{Var}(\hat{\vec{\beta}})$ is an $n\times n$
matrix.
\subsubsection{Transformation of Time}\label{cox:transform}
Transformation of the time values often helps the analysis of the correlation
between the scaled Schoenfeld residuals and time. Common transformation methods
include taking ranks, taking the logarithm, and the Kaplan--Meier method. (We do
not fully understand the last one yet.)
\footnote{The \texttt{cox.zph()} function in R allows different options for
transformation of times: \texttt{``km''}, \texttt{``rank''},
\texttt{``log''} and \texttt{``identity''}.
\texttt{``km''} stands for the Kaplan--Meier method,
\texttt{``rank''} takes the ranks of the times instead of the times,
\texttt{``log''} uses the logarithm of the times,
and \texttt{``identity''} directly uses the times~\cite{coxzph}.}
%To quantify whether there is a trend in the Schoenfeld residuals over
%time, we can compute the correlation between the scaled residuals and the
%ranks of the times, where the scaled residuals are given by
%\begin{equation}
%\hat{\vec{r}}_i = m \widehat{\mbox{Var}}(\vec{\beta}) \vec{r}_i\ ,
%\end{equation}
%where $m$ is the number of uncensored survival times, and
%$\widehat{\mbox{Var}}(\vec{\beta})$ is the estimated variance matrix for
%the fitted coefficients.
\subsubsection{$p$-values}
The process for computing the $p$-values of the correlation, as done in R's
\texttt{survival} package, is as follows.
Let $m$ be the number of uncensored data points, $n$ be the number of
features, $\hat{\vec{t}}$ be the transformed survival times, which is a
length-$m$ vector,
$\mathit{Var}(\hat{\vec{\beta}})$ be the variance matrix of the
fitted coefficients, which is an $n\times n$ matrix, and $\hat{\vec{R}}$
be the matrix of unscaled Schoenfeld residuals, which is an $m\times n$ matrix.
\begin{eqnarray}
\vec{w} &=& \hat{\vec{t}} - \bar{\hat{t}}\ , \mbox{ a length-}m \mbox{
vector}\ ,\\
\vec{v} &=& m\,\vec{w}^T\hat{\vec{R}}\mathit{Var}(\hat{\vec{\beta}})\ , \mbox{
a length-}n\mbox{ vector}\\
z_i &=& \frac{1}{m\,\vec{w}^T\vec{w}}
\cdot\frac{v^2_i}{\left[\mathit{Var}(\hat{\vec{\beta}})\right]_{ii}}\
, \mbox{ for }i=1,\dots,n\ ,\\
p_i &=& 1 - \chi_1^2(z_i), \mbox{ for }i=1,\dots,n\ .
\end{eqnarray}
Here $\bar{\hat{t}}$ is the average of the transformed survival
times, $z_i$ is the $z$-statistic for the $i$-th coefficient, and $p_i$ is the
$p$-value for the $i$-th coefficient. We need a separate function to compute
$\vec{w}$, but both $\vec{v}$ and $\vec{w}^T\vec{w}$ can be computed
in an aggregate function, and $z_i$ and $p_i$ can be computed in the final
function of that aggregate. $\chi^2_1(\cdot)$ is the
chi-square distribution function with one degree of freedom.
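A direct NumPy/SciPy transcription of these formulas might look as follows (the
helper name \texttt{pha\_test\_pvalues} is ours; \texttt{scipy.stats.chi2} supplies
the chi-square distribution function).
\begin{verbatim}
import numpy as np
from scipy.stats import chi2

def pha_test_pvalues(t_transformed, residuals, var_beta):
    """p-values for the proportional-hazards test.
    t_transformed : length-m vector of transformed uncensored survival times
    residuals     : m x n matrix of unscaled Schoenfeld residuals
    var_beta      : n x n covariance matrix of the fitted coefficients"""
    m = len(t_transformed)
    w = t_transformed - t_transformed.mean()        # centered transformed times
    v = m * (w @ residuals @ var_beta)              # length-n vector
    z = v**2 / (m * (w @ w) * np.diag(var_beta))    # z-statistics
    p = 1.0 - chi2.cdf(z, df=1)                     # chi-square, 1 d.o.f.
    return z, p
\end{verbatim}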
\subsection{Implementation}
We can use the recurrence equations (\ref{eq:numerator_avg},
\ref{eq:denominator_avg}, \ref{eq:hessian}) to compute the residuals and the
Hessian, which is needed by the variance matrix.
The current implementation uses an ordered aggregate on the data to
compute these quantities, which does not run in parallel. To enable a distributed
solution, we can use the ``GROUP BY'' functionality provided by SQL to enable the
independent computation of the log-likelihood function in each distributed segment
corresponding to each stratum (`transition' function). These values can then be added
across the segments (`merge' function), with the gradient for the parameters computed
on the final sum (`final' function).
\subsection{Resolving Ties}
In ``coxph\_train'', the Breslow method is used to resolve ties. It uses
the following log partial likelihood~\cite{hosmer2011applied}:
\begin{eqnarray}
L &=& \sum_{i=1}^{m}\left[ \vec{\beta}^{T} \vec{x}_{i+} -
d_i \log\left( \sum_{j \in R(t_i)} e^{\vec{\beta}^T
\vec{x}_j}\right) \right]\\
&=& \sum_{i=1}^{m}\left[ \vec{\beta}^{T} \vec{x}_i - \log\left( \sum_{j \in R(t_i)} e^{\vec{\beta}^T
\vec{x}_j}\right)\right]_{+}
\end{eqnarray}
where $d_i$ is the number of observations that have the same $t_i$,
$x_{i+}$ is the sum of the covariates over all observations that
share the same $t_i$, and $m$ is the number of distinct survival
times. Here $[\cdot]_{+}$ means summing over all observations with the
same survival time.
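A minimal NumPy sketch of the Breslow log partial likelihood (the helper name
\texttt{breslow\_loglik} is ours) groups the tied death times and uses every
observation with a survival time at least $t_i$, censored or not, as the risk set.
\begin{verbatim}
import numpy as np

def breslow_loglik(X, times, delta, beta):
    """Breslow log partial likelihood under ties. x_{i+} is the sum of the
    covariates of the d_i deaths tied at t_i; the risk set is everyone with a
    survival time >= t_i."""
    eta = X @ beta
    w = np.exp(eta)
    L = 0.0
    for t in np.unique(times[delta == 1]):   # distinct death times
        deaths = (times == t) & (delta == 1)
        d_i = deaths.sum()
        x_plus_eta = eta[deaths].sum()        # beta^T x_{i+}
        S = w[times >= t].sum()               # risk-set denominator
        L += x_plus_eta - d_i * np.log(S)
    return L
\end{verbatim}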
\subsection{Robust Variance Estimators}
In MADlib, we implement the robust variance estimator devised by
Lin and Wei~\cite{lin1989robust}.
With the notation above, let
\begin{eqnarray*}
\vec{W}_{i}
&=& \delta_i \cdot \left[ \vec{x_i} - \frac{\left( \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}} \vec{x_l} \right)}{\sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}}} \right]
- \sum_{j: t_j \le t_i} \left\{ \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{\sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}}}
\cdot \left[ \vec{x_i} - \frac{\left( \sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}} \vec{x_k} \right)}{\sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}}} \right] \right\}.\\
&=& \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right]
- \sum_{j: t_j \le t_i} \left\{ \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j}
\cdot \left[ \vec{x_i} - \frac{\vec{H}_j}{S_j} \right] \right\}.\\
&=& \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right]
- \sum_{j: t_j \le t_i} \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j} \vec{x_i}
+ \sum_{j: t_j \le t_i} \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j} \frac{\vec{H}_j}{S_j}.\\
\end{eqnarray*}
where
\begin{eqnarray*}
H_i &=& \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}} \vec{x_l}\\
S_i &=& \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}}
\end{eqnarray*}
Let
\begin{eqnarray*}
A_i &=& \sum_{j: t_j \le t_i} \frac{\delta_j}{S_j},\\
\vec{B}_i &=& \sum_{j: t_j \le t_i} \frac{\delta_j \vec{H}_j}{S_j^2},\\
\end{eqnarray*}
we have
\[
\vec{W}_i = \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right] - e^{\vec{\beta}^T \vec{x_i}} A_i \vec{x_i} + e^{\vec{\beta}^T \vec{x_i}} \vec{B}_i.
\]
The recursions for $A_i$ and $\vec{B}_i$, assuming the times $t_i$ are
sorted in ascending order, are
\begin{eqnarray*}
A_i &=& A_{i-1} + \frac{\delta_i}{S_i}\\
\vec{B}_i &=& \vec{B}_{i-1} + \frac{\delta_i \vec{H}_i}{S_i^2}
\end{eqnarray*}
The meat part of the sandwich estimator is
\begin{equation}
\mathbb{M} = \sum_{i=1}^n\vec{W}_i\vec{W}_i^T\ .
\end{equation}
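For illustration, the NumPy sketch below (the helper name \texttt{robust\_meat}
is ours) computes the $\vec{W}_i$ via the two recursions above and accumulates
the meat matrix; it assumes distinct survival times sorted in ascending order,
so $\{l : t_l \ge t_i\}$ is rows $i$ onward and $\{j : t_j \le t_i\}$ is rows
up to $i$.
\begin{verbatim}
import numpy as np

def robust_meat(X, delta, beta):
    """Lin-Wei robust ('sandwich') meat matrix. H_i and S_i are accumulated
    in a backward pass; A_i and B_i in a forward pass, per the recursions."""
    n, m = X.shape
    w = np.exp(X @ beta)
    # Backward pass: risk-set sums H_i, S_i.
    H = np.zeros((n, m)); S = np.zeros(n)
    h = np.zeros(m); s = 0.0
    for i in range(n - 1, -1, -1):
        h = h + w[i] * X[i]; s = s + w[i]
        H[i] = h; S[i] = s
    # Forward pass: A_i, B_i and the score residuals W_i.
    A = 0.0; B = np.zeros(m)
    M = np.zeros((m, m))
    for i in range(n):
        A += delta[i] / S[i]
        B += delta[i] * H[i] / S[i]**2
        W = delta[i] * (X[i] - H[i] / S[i]) - w[i] * A * X[i] + w[i] * B
        M += np.outer(W, W)
    return M
\end{verbatim}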
\subsection{Clustered Variance Estimation}
The data has multiple clusters $m=1,\dots,K$, and the meat part is
\begin{equation}
\mathbb{M} = \mathbb{A}^T\mathbb{A}\ ,
\end{equation}
where the matrix $\mathbb{A}$'s $m$-th row is given by
\begin{equation}
\mathbb{A}_m = \sum_{i\in G_m} \vec{W}_i\ .
\end{equation}
Here $G_m$ is the set of rows that belong to the $m$-th cluster.
With stratification, we take the strata into account only when we
compute $\vec{H}_i$, $S_i$, $A_i$, $\vec{B}_i$ and $\vec{W}_i$. The
calculations of $\mathbb{A}_m$ and $\mathbb{M}$ only need to group by
the cluster variables; the strata variables are irrelevant.
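A small sketch of the clustered meat computation, assuming the per-observation
terms $\vec{W}_i$ (for example, returned by a variant of the robust-variance
sketch above) are stacked into a matrix with one row per observation and one
column per feature; the helper name \texttt{clustered\_meat} is ours.
\begin{verbatim}
import numpy as np

def clustered_meat(W, clusters):
    """Clustered meat matrix: the rows W_i are summed within each cluster to
    form the rows of A, and the meat is M = A^T A."""
    labels = np.unique(clusters)
    A = np.vstack([W[clusters == g].sum(axis=0) for g in labels])
    return A.T @ A
\end{verbatim}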
\section{How to Prevent Underflow and Overflow Errors}
A problem that is not mentioned above but appears in real
applications of CoxPH training is underflow or
overflow. The computation involves exponential functions,
and it is very easy to get underflow or overflow errors if the
coefficients become too small or too large at certain steps.

We use the same method as R's \texttt{survival} package to deal with
possible underflow/overflow errors. This method contains several
parts, which make the algorithm described above even more
complicated:
(1) Center and scale the independent variables:
\begin{equation}
x_i \rightarrow \frac{x_i - E[x]}{E[\vert x - E[x] \vert]}
\end{equation}
(2) Estimate the maximum possible value of all the coefficients using
the coefficients computed from the first iteration.
\begin{equation}
\beta_k^{(max)} = 20 \, \sqrt{\frac{h_{kk}}{\sum_i \delta_i}}\ ,
\end{equation}
where $\beta_k^{(max)}$ is the estimated maximum possible value
of the coefficient $\beta_k$, $h_{kk} = \partial^2 L / \partial
\beta_k^2$ is the $k$-th diagonal element of the Hessian matrix, and
$\delta_i \in \{0,1\}$ is the censoring status of the $i$-th record.
During the computation, whenever a coefficient satisfies $\vert \beta_k \vert
> \beta_k^{(max)}$, we set $\beta_k \rightarrow \mbox{sign}(\beta_k)
\beta_k^{(max)}$.
The authors of the \texttt{survival} package explain at
\texttt{http://stat.ethz.ch/R-manual/R-devel/RHOME/library/survival/doc/sourcecode.pdf}
why they use such an estimate for the coefficients:
``We use a cutpoint of $\beta * \mbox{std}(x) < 23$,
where the standard deviation is the average standard deviation of $x$ within a risk
set. The rationale is that $e^{23}$ is greater than the current world
population, so such a coefficient corresponds to a between-subject
relative risk that is larger than any imaginable.''
In their implementation, they also use $1/h_{kk}$ as an approximation
to $\mbox{std}(x)$, and besides $23$, they also use $20$ in the
estimate.
(3) Although (1) and (2) stabilize the computation, they are still not
enough, so a step-halving method is also used. Whenever the current iteration's
log-likelihood is smaller than that of the previous iteration, we replace
the coefficients with
\begin{equation}
\beta_k = \frac{1}{2}(\beta_k^{new} + \beta_k^{old})
\end{equation}
(4) The stopping threshold is
\begin{equation}
1 - \frac{L_{new}}{L_{old}} < \mbox{threshold}\ .
\end{equation}
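As a compact illustration of safeguards (2) and (3), the sketch below
(the helper name \texttt{safeguarded\_update} is ours) applies step-halving when
the log-likelihood decreases and clamps each coefficient to its estimated
maximum magnitude.
\begin{verbatim}
import numpy as np

def safeguarded_update(beta_old, beta_new, beta_max, L_old, L_new):
    """Apply the safeguards: step-halving (average new and old coefficients)
    whenever the log-likelihood decreased, then clamp so that
    |beta_k| <= beta_k^(max) elementwise."""
    if L_new < L_old:
        beta_new = 0.5 * (beta_new + beta_old)      # step-halving
    return np.clip(beta_new, -beta_max, beta_max)   # coefficient clamping
\end{verbatim}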
\section{Marginal Effects}
See \ref{sub:marginal_effects} for an introduction to marginal effects
(all notations below are the same as those defined in \ref{sub:marginal_effects}).
We implement the default algorithm used by Stata 13.1 for computing
marginal effects. Note that older versions of Stata may use a
different default algorithm and hence produce values different from MADlib's.
\subsection{Basic Formulation}
The relative hazard ratio for the $i$-th record is given by
\begin{equation}
h_i = h_0 \exp \left( \vec{\beta}\cdot\vec{f} \right)\ ,
\end{equation}
where $h_0$ is the baseline hazard and both $\vec{\beta}$ and
$\vec{f}$ are vectors. Here we use the indices
$i$ or $j$ for the data records, $a$ or $b$ for the indices of
covariate terms in $\vec{\beta}\cdot \vec{f}(\vec{x}_i)$, and
$u$ or $v$ for the indices of $\vec{x}$.

The value of the baseline $h_0$ is arbitrary and difficult to compute. Stata
ignores the baseline hazard value for the computation of marginal effects. In
MADlib, we follow the same principle and ignore the baseline (i.e.\ set the
baseline to 1) when computing the hazard value.
Thus the marginal effect corresponding to variable $x_k$ is computed as,
\begin{align*} \label{eq:me1}
\mathit{ME}_{k} & = \pder[h_i]{x_k} \\
& = \pder[e^{\vec{\beta} \vec{f}}]{x_k} \\
& = e^{\vec{\beta} \vec{f}} \vec{\beta} \pder[f]{x_k}.
\end{align*}
Vectorizing the above equation (similar to \ref{ssub:logistic_regression}) gives
\begin{equation*}
\mathit{ME} = e^{\vec{\beta} \vec{f}} J^T \vec{\beta},
\end{equation*}
where $J$ is defined in \ref{sub:marginal_effects_for_regression_methods}.
Censoring status and stratification are irrelevant to the marginal effect
calculation and can be ignored.
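For a single record, the vectorized formula above can be sketched in a few lines
of NumPy (the helper name \texttt{cox\_marginal\_effects} is ours; $J$ is assumed
to hold the partial derivatives $\partial f_a / \partial x_u$).
\begin{verbatim}
import numpy as np

def cox_marginal_effects(beta, f, J):
    """Marginal effects with the baseline hazard set to 1:
    ME = exp(beta . f) * J^T beta, where f is the vector of covariate terms
    for one record and J[a, u] = d f_a / d x_u is its Jacobian."""
    return np.exp(beta @ f) * (J.T @ beta)
\end{verbatim}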
\subsection{Categorical variables}
For categorical variables, we compute the discrete difference as described in
\ref{sub:categorical_variables}.
The discrete difference with respect to $x_k$ is given as
\begin{align*}
\mathit{ME_k} &= h^{set} - h^{unset} \\
&= e^{\vec{\beta}\vec{f^{set}}} - e^{\vec{\beta}\vec{f^{unset}}}
\end{align*}
\subsection{Standard Error}
As has already been described in \ref{sub:standard_errors}, the method to compute
the standard errors is
\begin{equation}
\mbox{Var}(ME) = \mathbb{S}\, \mbox{Var}(\vec{\beta})\,
\mathbb{S}^T,
\end{equation}
where the matrix $\mathbb{S}$, computed as the partial derivatives of
the marginal effects with respect to the coefficients, is an $M\times N$ matrix with entries $S_{mn}
= \pder[\mathit{ME}_m]{\beta_n}$:
\begin{align*}
S_{mn} & = \pder[\mathit{ME}_m]{\beta_n} \\
& = \pder{\beta_n} \left( e^{\vec{\beta} \vec{f}} \vec{\beta} \pder[\vec{f}]{x_m} \right)\\
& = e^{\vec{\beta} \vec{f}} \pder{\beta_n}\left(\vec{\beta} \pder[\vec{f}]{x_m} \right) +
\pder[e^{\vec{\beta} \vec{f}}]{\beta_n} \vec{\beta} \pder[\vec{f}]{x_m} \\
& = e^{\vec{\beta} \vec{f}} \pder[f_n]{x_m} +
e^{\vec{\beta} \vec{f}} \cdot f_n \cdot \vec{\beta} \pder[\vec{f}]{x_m} \\
& = e^{\vec{\beta} \vec{f}} \left( \pder[f_n]{x_m} +
f_n \cdot \vec{\beta} \pder[\vec{f}]{x_m} \right).
\end{align*}
Vectorizing this equation to express for the complete matrix,
\begin{equation*}
\vec{S} = e^{\vec{\beta} \vec{f}} \left(J^T +
(J^T \vec{\beta}) \vec{f}^T \right).
\end{equation*}