% When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines.
% !TEX root = ../design.tex
\chapter[Cox Proportional-Hazards]{Cox Proportional-Hazards}
\section{Introduction}
% Abstract. What is the problem we want to solve?
Proportional-hazards models enable comparison of \textit{survival models}.
Survival models are functions describing the probability of a one-time event
(prototypically, this event is death) with respect to time. The interval of
time before death occurs is the \textit{survival time}. Let $T$ be a random
variable representing the survival time, with a cumulative distribution function
$P(t)$. Informally, $P(t)$ represents the probability that death has happened
before time $t$.
An equivalent formulation is the \textit{survival function} $S(t)$, defined as
$S(t) \equiv 1 - P(t)$. Informally, this is the probability that death has not
happened by time $t$. The \textit{hazard function} $h(t)$ assesses the
instantaneous risk of death at time $t$, conditional on survival up to time $t$:
\begin{align}
h(t) &= \lim_{\Delta t \rightarrow 0} \frac{\Pr\left(t \le T < t + \Delta t \mid T \ge t \right)}
{\Delta t} \\
&= \lim_{\Delta t \rightarrow 0} \frac{1}{\Delta t}\frac{\Pr(T < t + \Delta t) - \Pr(T < t)}{\Pr(T \ge t)}
\end{align}
The relationship between $h(t)$ and $S(t)$, using $S(t) = 1 - \Pr(T < t)$, is
\begin{align}
h(t) & = \lim_{\Delta t \rightarrow 0} \frac{1}{\Delta t} \frac{\left(S(t + \Delta t) - S(t)\right)}{-S(t)}\\
& = \frac{-S'(t)}{S(t)}
\end{align}
where
$$S'(t) = \lim_{\Delta t \rightarrow 0} \frac{S(t + \Delta t) - S(t)}{\Delta t}$$
denotes the derivative of $S(t)$.
In the simplest case, the Cox model assumes that $h(t)$ is
\begin{equation}
h(t) = e^{\alpha(t)}
\end{equation}
where $e^{\alpha(t)}$ is the \textit{baseline hazard function}, which depends only on
$t$. However, in many applications, the probability of death may depend on more
than just $t$. Other covariates, such as age or weight, may be important. Let
$x_i$ denote the observed value of the $i$th covariate; the Cox model is then
written as
\begin{equation}
h(t) = e^{\alpha(t)} e^{\beta_1 x_1 + \beta_2 x_2 + \dots + \beta_m x_m} = e^{\alpha(t)} e^{\mathbf{\beta^T x}}
\end{equation}
where $\beta_i$ is the coefficient associated with the $i$th covariate.
Many applications take values from multiple observations, measuring the values
of $x_i$ for each observation. %The $j$th observation has the hazard function
%\begin{equation}
%h_j(t) = e^{\alpha(t)} e^{\beta_1 x_{j1} + \beta_2 x_{j2}+ \dots + \beta_k x_{jk}}= e^{\alpha(t)} e^{\mathbf{\beta^T x_j}}.
%\end{equation}
In the \textit{proportional-hazard model}, the hazard functions of two
observations $j$ and $k$ are compared. The ratio of the two is
\begin{equation}
\frac{h_j(t)}{h_k(t)} = \frac{e^{\alpha(t)} e^{\mathbf{\beta^T x_j}} }{e^{\alpha(t)} e^{\mathbf{\beta^T x_k}} } = \frac{e^{\mathbf{\beta^T x_j} }}{ e^{\mathbf{\beta^T x_k}}}
\end{equation}
The critical idea here is that the ratio of the two hazard functions is
completely independent of the baseline function. This allows meaningful
comparisons between samples without knowing the baseline function, which may be
difficult to measure or specify.
% SECTION: Applications
\section{Applications}\label{cox:Application}
Generally, applications start with a list of $n$ observations, each with $m$
covariates and a time of death. From this $n \times (m+1)$ matrix, we would
like to derive the correlation between the covariates and the hazard function.
This amounts to finding the values of $\beta$.
The values of $\beta$ can be estimated with the method of \textit{partial
likelihood}. This method begins by sorting the observations by time of death
into a list $[t_1, t_2, \dots, t_n]$ such that $t_i \le t_j$ whenever $i < j$.
For any time $t_i$, let $R(t_i)$ be the set of observations still alive
at time $t_i$.
Given that there was a death at time $t_i$ and observation $k$ was alive prior
to $t_i$, the probability that the death happened to observation $k$ is
\begin{equation}\label{cox:equ:death-prob}
\Pr(T_k = t_i | R(t_i)) = \frac{e^{\mathbf{\beta^T x_k} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}}.
\end{equation}
The \textit{partial likelihood function} can now be generated as the product of conditional probabilities.
More formally,
\begin{equation}\label{cox:equ:likelihood}
\mathcal{L} = \prod_{i = 1}^n \left( \frac{e^{\mathbf{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}} \right).
\end{equation}
The log-likelihood form of this equation is
\begin{equation}\label{cox:equ:LLH}
L = \sum_{i = 1}^n \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right].
\end{equation}
An estimate of $\beta$ can be found by maximizing this log-likelihood.
To maximize the likelihood, it helps to have the derivative of equation
\ref{cox:equ:LLH}, which is
\begin{equation}\label{cox:equ:LLH derivative}
\frac{\partial L}{\partial \beta_k} = \sum_{i = 1}^n \left( x_{ik} - \frac{\sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } } \right).
\end{equation}
It follows that the second derivative is
\begin{equation}\label{cox:equ:LLH second derivative}
\frac{\partial^2 L}{\partial \beta_k \partial \beta_l} = \sum_{i = 1}^n
\left(
\frac{\left( \sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x_j} } \right)
\left( \sum_{j \in R(t_i)} x_{jl} e^{\mathbf{\beta^T x_j} } \right)}
{\left( \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right)^2 } -
\frac{ \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\mathbf{\beta^T x_j} } }
{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } }
\right).
\end{equation}
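To make these formulas concrete, the following NumPy sketch (the helper name
\texttt{cox\_loglik\_grad\_hess} is ours, not part of MADlib, whose implementation
is in SQL and C++) evaluates Equations \ref{cox:equ:LLH},
\ref{cox:equ:LLH derivative} and \ref{cox:equ:LLH second derivative} naively in
$O(n^2)$ time. It assumes the rows of \texttt{X} are sorted by time of death,
with no censoring and no ties.
\begin{verbatim}
import numpy as np

def cox_loglik_grad_hess(X, beta):
    """Naive evaluation of the partial log-likelihood, its gradient and its
    Hessian. Rows of X are assumed sorted by time of death, so the risk set
    R(t_i) is simply rows i..n-1."""
    n, m = X.shape
    eta = X @ beta                      # beta^T x_j for every observation
    w = np.exp(eta)
    L = 0.0
    grad = np.zeros(m)
    hess = np.zeros((m, m))
    for i in range(n):
        Xi, wi = X[i:], w[i:]           # rows in the risk set R(t_i)
        S = wi.sum()                    # sum_j exp(beta^T x_j)
        H = Xi.T @ wi                   # sum_j x_j exp(beta^T x_j)
        V = Xi.T @ (Xi * wi[:, None])   # sum_j x_j x_j^T exp(beta^T x_j)
        L += eta[i] - np.log(S)
        grad += X[i] - H / S
        hess += np.outer(H, H) / S**2 - V / S
    return L, grad, hess
\end{verbatim}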
% Incomplete Data
\subsection{Incomplete Data}
Frequently, not every observation will have an associated time of death.
Typically, this arises when the period of observation terminates before the
entire population being studied has died. This is known as \textit{censoring}
the data. To account for this, an additional indicator variable $\delta_i$ is
introduced, which is set to 1 if the $i$th observation has an associated time of
death and 0 otherwise.
Incorporating this indicator variable into equation \ref{cox:equ:likelihood} gives
\begin{equation}\label{cox:equ:likelihood-censoring}
\mathcal{L} = \prod_{i = 1}^n \left( \frac{e^{\mathbf{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}} \right)^{\delta_i}.
\end{equation}
The corresponding changes to the log-likelihood function and its derivatives are straightforward.
% SECTION: Implementation
\subsection{Partitioning and Aggregation of the Data}
To speed up the computation, we first partition the data and
aggregate each partition into one big row. During the
computation, an entire big row is loaded into memory for
processing, which speeds up the computation.

When we partition the data, we want to (1) preserve the descending
order of the time column, (2) make sure that each partition contains
approximately the same amount of data so that the work load is even,
and (3) do this as fast as possible.

Our solution is to first sample a portion of the data and then
compute approximate break points from the sample. The sample should be
small enough to load into memory, and also large enough that the break
points can be computed relatively accurately.

After we have partitioned the data, we aggregate each partition into
one big row. The order of the data must be preserved during the
aggregation.

We then use the sequential algorithm described in the next subsection to process
the new data row by row in the order of the time column. Since each
big row contains many of the original rows, and we process them all in
memory, this speeds up the computation.
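To make the sampling step concrete, here is a small NumPy sketch (the helper name
\texttt{approximate\_break\_points} and its parameters are ours for illustration)
that draws a sample of the time column and uses its quantiles as approximate
partition boundaries, so each partition receives roughly the same number of rows.
\begin{verbatim}
import numpy as np

def approximate_break_points(time_column, num_partitions,
                             sample_size=10_000, seed=None):
    """Draw a manageable sample of the time column and use its quantiles as
    approximate break points for partitioning."""
    rng = np.random.default_rng(seed)
    sample = rng.choice(time_column,
                        size=min(sample_size, len(time_column)),
                        replace=False)
    # Interior quantiles of the sample serve as partition boundaries.
    qs = np.linspace(0.0, 1.0, num_partitions + 1)[1:-1]
    return np.quantile(sample, qs)

# Example: split survival times into 4 partitions of roughly equal size.
times = np.random.default_rng(0).exponential(scale=5.0, size=100_000)
print(approximate_break_points(times, num_partitions=4))
\end{verbatim}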
\subsection{Implementation of Newton's Method}
Newton's method is the most common choice for estimating $\beta$ by maximizing
the log-likelihood in Equation \ref{cox:equ:LLH}, using the following update rule:
\begin{equation}
\beta^{(k+1)} = \beta^{(k)} - \alpha_k \left( {\nabla^2 L}^{-1} \nabla L \right),
\end{equation}
where $\alpha_k$ is a positive scalar denoting the step length in the Newton
direction ${\nabla^2 L}^{-1} \nabla L$, determined using the first and second
derivative information. We would like to emphasize that the problems we are
designing this system for are those with many records and few features, i.e.\ $n
\gg m$, thereby keeping the inverse operation on the Hessian matrix relatively
cheap.
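For illustration, here is a minimal sketch of the resulting Newton iteration.
The callable \texttt{loglik\_grad\_hess} is hypothetical (for example, the helper
sketched in Section \ref{cox:Application}); a full step ($\alpha_k = 1$) is taken,
and the stopping rule is the relative change in the log-likelihood.
\begin{verbatim}
import numpy as np

def newton_optimize(loglik_grad_hess, beta0, tol=1e-8, max_iter=100):
    """Newton iteration for the Cox partial log-likelihood.
    loglik_grad_hess(beta) is assumed to return (L, grad, hess).
    Solving against the Hessian is cheap because the number of
    features m is assumed to be small."""
    beta = np.asarray(beta0, dtype=float)
    L_old, grad, hess = loglik_grad_hess(beta)
    for _ in range(max_iter):
        beta = beta - np.linalg.solve(hess, grad)   # full Newton step
        L_new, grad, hess = loglik_grad_hess(beta)
        if abs(1.0 - L_new / L_old) < tol:          # relative-change stopping rule
            break
        L_old = L_new
    return beta
\end{verbatim}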
The gradient and Hessian may be hard to parallelize, which reduces the
advantage of having a large number of observations. To elaborate, consider equations
\ref{cox:equ:LLH derivative} and \ref{cox:equ:LLH second derivative}, which are
sums of independent terms. One might think it natural to reduce the
computational burden by parallelization: each term could be computed independently
in parallel by a set of worker tasks, and a master task could collect the outputs
from the workers and sum them together. However, this might not work well in this
setting. To see why,
consider parallelizing equation \ref{cox:equ:LLH second derivative}. Each
worker task is given one term in the sum, which looks like
\begin{equation}
\frac{\left( \sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x _j} } \right) \left( \sum_{j \in R(t_i)} x_{jl} e^{\mathbf{\beta^T x _j} } \right)}{\left( \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right)^2 } - \frac{ \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\mathbf{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } }.
\end{equation}
Note that the sums in this term range over the risk set $R(t_i)$, which for
early times contains nearly all of the data points in the data matrix. A
similar issue is encountered while computing the first
derivative terms defined in \ref{cox:equ:LLH derivative}. However, we note
that this sum has a structure that allows it to be computed in linear time (with
respect to the number of data points) using the following quantities:
\begin{align}
H_{i} &= \sum_{j \in R(t_i)} x_{j} e^{\mathbf{\beta^T x_j}}\\
S_{i}&= \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}} \\
V_{i}&= \sum_{j \in R(t_i)} x_{j}x_{j}^T e^{\mathbf{\beta^T x_j} }
\end{align}
Note that $H_{i}$ is a column vector with $m$ elements ($H_{i}\in
\mathbb{R}^m$), $S_{i}$ is a scalar, and $V_{i}$ is an $m \times m$ matrix.
We can now write the gradient of the log-likelihood,
defined in Equation \ref{cox:equ:LLH derivative}, as
\begin{align}
\nabla L = \sum_{i = 1}^n \left( x_{i} - \frac{H_{i} }{ S_{i}} \right),
\end{align}
while the Hessian, defined in Equation \ref{cox:equ:LLH second
derivative}, can be reduced to
\begin{align}
\nabla^2 L = \sum_{i = 1}^n \left( \frac{H_{i}H_{i}^T }{ S_{i}^2 } - \frac{V_{i}}{ S_{i} } \right).
\end{align}
Since we assume that the data points are sorted in increasing order, i.e.\
$R(t_i) = \{i, i+1 \ldots n \}$, we can calculate the above quantities with the recurrences
\begin{align}
H_{i}& = H_{i+1} + x_{i} e^{\mathbf{\beta^T x_i}} \label{eq:numerator_avg}\\
S_i & = S_{i+1} + e^{\mathbf{\beta^T x_i}} \label{eq:denominator_avg}\\
V_i & = V_{i+1} + x_{i} x_{i}^T e^{\mathbf{\beta^T x_i}} \label{eq:hessian}.
\end{align}
With these recurrence relationships, the Hessian matrix and the gradient
can be computed in linear time.
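For illustration, the NumPy sketch below (the helper name
\texttt{cox\_grad\_hess\_linear} is ours) accumulates $H_i$, $S_i$ and $V_i$ in a
single backward pass over the sorted data and returns the gradient and Hessian;
censoring is omitted, as in the equations above.
\begin{verbatim}
import numpy as np

def cox_grad_hess_linear(X, beta):
    """Linear-time gradient and Hessian via the recurrences for H_i, S_i, V_i.
    X is assumed sorted so that R(t_i) = {i, i+1, ..., n}; one backward pass
    accumulates the risk-set sums, so the cost is O(n m^2) instead of O(n^2 m^2)."""
    n, m = X.shape
    w = np.exp(X @ beta)
    grad = np.zeros(m)
    hess = np.zeros((m, m))
    H = np.zeros(m)           # H_{n+1} = 0
    S = 0.0                   # S_{n+1} = 0
    V = np.zeros((m, m))      # V_{n+1} = 0
    for i in range(n - 1, -1, -1):        # from the last observation backwards
        H += w[i] * X[i]
        S += w[i]
        V += w[i] * np.outer(X[i], X[i])
        grad += X[i] - H / S
        hess += np.outer(H, H) / S**2 - V / S
    return grad, hess
\end{verbatim}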
%In addition to the computational expense of computing the Hessian, the inverse
%Hessian must be computed as well. Unfortunately, matrix inversion is an
%expensive operation, and cannot be parallelized.
% SECTION: Stratification Support
\section{Stratification Support}\label{cox:stratified}
A crucial component of the Cox Proportional Hazards model is the proportional hazards assumption:
The hazard for a given individual is a fixed proportion of the hazard for any
other individual in the same stratum, and the ratio of the hazards is constant
across time.
In actual use cases, the proportional-hazards assumption may not be satisfied if
we use all independent variables as covariates. A stratified Cox regression
model may then be useful. It offers a way to choose a subset of
independent variables as covariates while still taking the remaining
independent variables into account. The stratified Cox regression is available
in both R~\cite{rcox} and Stata~\cite{statacox}.
Stratification is used as shorthand for building a Cox model that allows for
more than one stratum, and hence, allows for more than one baseline hazard
function. Stratification provides two pieces of key, flexible functionality for
the end user of Cox models:
\begin{enumerate}
\item Allows a categorical variable Z to be appropriately accounted for in
the model without estimating its predictive/associated impact on the
response variable (i.e. without estimating Z's ``coefficient'').
\item Accounts for a categorical variable Z that is predictive of/associated with the response
variable but may not satisfy the proportional hazards assumption.
\end{enumerate}
To explicitly clarify how stratification differentiates from grouping support:
\begin{itemize}
\item Grouping by a categorical column would build a completely separate
Cox model for each value of the categorical column, where the baseline
hazards for each value of the categorical column would be different and
the estimated coefficient values for each explanatory variable would be
different for each value of the categorical column.
\item Stratifying by a categorical column would build a single
common Cox model, where the baseline hazards for each value of the
categorical column would be different, but the estimated coefficient values
for each explanatory variable would be the same for each value of the stratum.
\end{itemize}
It is valuable to emphasize that all strata share all coefficients, and that the
only difference between strata is the baseline hazard. In other words,
coefficients for all non-strata explanatory variables are identical across the
different strata.
\subsection{Estimating A Stratified Cox Model}\label{cox:estimate-stratified}
The parameter estimation is done by maximizing the product of likelihoods, each
from a stratum~\cite{stratifiedethzslides}.
Given $n$ observations, each with $m$ covariates and each belonging to one of $K$
strata\footnote{Note that this does not mean that we have $K$ variables other
than the $m$ covariates, but rather $K$ groups classified by the strata ID variables.}, let
$\mathit{ST}_{k}$ denote the set of observations in the $k$-th stratum.
Because maximizing the sum of log-likelihoods is equivalent to maximizing the
product of likelihoods, we work with log-likelihoods; according to Equation
\ref{cox:equ:LLH}, the log-likelihood associated with the $k$-th stratum is
\begin{equation}\label{cox:equ:obj-each-stratum}L_{k} = \sum_{i \in \mathit{ST}_{k}} \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right],
\end{equation}
where $R_k(t_i)$ is the set of observations in $\mathit{ST}_{k}$ that are still alive at time $t_i$.
Therefore, the objective function of stratified Cox regression can be expressed as
\begin{equation}\label{cox:equ:stratified-obj}
L^{\mathit{stratified}} = \sum_{k=1}^{K} L_{k} = \sum_{k=1}^{K} \sum_{i \in \mathit{ST}_{k}} \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right].
\end{equation}
The corresponding changes to the gradient, the Hessian, and the handling of censoring are straightforward.
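As a small illustration of Equation \ref{cox:equ:stratified-obj}, the NumPy sketch
below (the helper name \texttt{stratified\_cox\_loglik} is ours) simply sums the
ordinary partial log-likelihoods computed within each stratum; it assumes the rows
of each stratum are sorted by time of death and ignores censoring.
\begin{verbatim}
import numpy as np

def stratified_cox_loglik(X, strata, beta):
    """Stratified objective: the sum over strata of the per-stratum partial
    log-likelihoods. Within a stratum, R_k(t_i) consists of the remaining
    rows of that stratum (rows assumed sorted by time of death)."""
    total = 0.0
    for k in np.unique(strata):
        Xk = X[strata == k]
        eta = Xk @ beta
        w = np.exp(eta)
        # risk set at the i-th death of stratum k = rows i.. of this stratum
        total += sum(eta[i] - np.log(w[i:].sum()) for i in range(len(Xk)))
    return total
\end{verbatim}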
\subsection{When Do We Need A Stratified Cox Model?}\label{cox:diagnostic}
General practice in standard statistical packages (e.g.\ R, SAS) is to
make use of the Schoenfeld residuals to gauge whether or not the
proportional-hazards assumption has been violated.
% To this end, we need to investigate the implementation of R's
% \texttt{cox.zph()} function for MADlib.
% There are several parametric options available in the
% \texttt{cox.zph()} function, but we prioritize implementation of
% \texttt{cox.zph()} with its default parameter settings.
% The output of \texttt{cox.zph()} contains 3 numbers for each
% covariate, scaled Schoenfeld residual, chi-square, and p-value.
% The actual formulas of computing these are still to be investigated.
The Schoenfeld residuals are centered on zero and should be
independent of time if PHA (proportional hazard assumption) is
true. Deviations from this, i.e. residuals that exhibit some trend in
time, indicate that the PHA is violated.
The Schoenfeld residual at a time when a failure or death occurs is
defined as the difference between the observed and expected covariate values at
that time. Also note that the Schoenfeld residuals are zero for censored
observations:
\begin{equation}
\hat{\vec{r}}_i = \vec{x}_i - E[\vec{x}_i]\ ,\mbox{ only for
}\delta_i=1\ ,
\end{equation}
To compute the expected values at time $t_{i}$, we use the probability distribution given by Equation \ref{cox:equ:death-prob}.
\begin{eqnarray}
E[\vec{x}_i] &=& \sum_{k\in \mathcal{R}(t_i)}\vec{x}_k p(k\mbox{ dies at
}t_i) \nonumber\\
&=& \frac{\sum_{k\in \mathcal{R}(t_i)}\vec{x}_k
e^{\vec{\beta}^T\vec{x}_k}}{\sum_{j\in
\mathcal{R}(t_i)}e^{\vec{\beta}^T\vec{x}_j} },
\end{eqnarray}
where the values of $\vec{\beta}$ are the fitted coefficients, and $\mathcal{R}(t)$ is the set of individuals that are still alive at time $t$.
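For illustration, the NumPy sketch below (the helper name
\texttt{schoenfeld\_residuals} is ours) computes the unscaled residuals for the
uncensored observations, assuming the rows are sorted by survival time so that
the risk set at $t_i$ is rows $i$ onward.
\begin{verbatim}
import numpy as np

def schoenfeld_residuals(X, delta, beta):
    """Unscaled Schoenfeld residuals r_i = x_i - E[x_i], defined only for
    uncensored observations (delta_i = 1). Rows of X are assumed sorted by
    survival time."""
    n, m = X.shape
    w = np.exp(X @ beta)
    residuals = []
    for i in range(n):
        if delta[i] == 0:                  # censored: no residual
            continue
        Xi, wi = X[i:], w[i:]              # risk set at time t_i
        expected = (Xi.T @ wi) / wi.sum()  # E[x_i] under Eq. (death-prob)
        residuals.append(X[i] - expected)
    return np.array(residuals)             # one length-m row per death
\end{verbatim}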
\subsubsection{Scaled Schoenfeld Residuals}\label{cox:scaled-residual}
Suggested by Grambsch and Therneau \cite{grambsch1994proportional} and also
followed by statistical software packages \cite{testph}, scaling the Schoenfeld
residuals by an estimate of their variance yields greater diagnostic power. The
scaled Schoenfeld residuals are
\begin{eqnarray}
\hat{\vec{r}}^{*}_i &=& [\mathit{Var}(\hat{\vec{r}}_i)]^{-1} \hat{\vec{r}}_i\\
&\approx& m \mathit{Var}(\hat{\vec{\beta}}) \hat{\vec{r}}_i ,
\end{eqnarray}
where $\mathit{Var}$ denotes a covariance matrix, $m$ is the
number of uncensored survival times, and $n$ is the number of features;
$\hat{\vec{r}}_i$ is a length-$n$ vector and $\mathit{Var}(\hat{\vec{\beta}})$ is an $n\times n$
matrix.
\subsubsection{Transformation of Time}\label{cox:transform}
Transformation of the time values often helps the analysis of the correlation
between the scaled Schoenfeld residuals and time. Common transformation methods
include taking ranks, taking the logarithm, and the Kaplan--Meier method. (We do
not fully understand the last one yet.)
\footnote{The \texttt{cox.zph()} function in R allows different options for
transformation of times: \texttt{``km''}, \texttt{``rank''},
\texttt{``log''} and \texttt{``identity''}.
\texttt{``km''} stands for the Kaplan--Meier method,
\texttt{``rank''} takes the ranks of the times instead of the times,
\texttt{``log''} uses the logarithm of the times,
and \texttt{``identity''} directly uses the times~\cite{coxzph}.}
%To quantify whether there is a trend in the Schoenfeld residuals over
%time, we can compute the correlation between the scaled residuals and the
%ranks of the times, where the scaled residuals are given by
%\begin{equation}
%\hat{\vec{r}}_i = m \widehat{\mbox{Var}}(\vec{\beta}) \vec{r}_i\ ,
%\end{equation}
%where $m$ is the number of uncensored survival times, and
%$\widehat{\mbox{Var}}(\vec{\beta})$ is the estimated variance matrix for
%the fitted coefficients.
\subsubsection{$p$-values}
The process for computing the $p$-values of the correlation, as done in R's
\texttt{survival} package, is as follows.
Let $m$ be the number of uncensored data points, $n$ be the number of
features, $\hat{\vec{t}}$ be the transformed survival times, which is a
length-$m$ vector,
$\mathit{Var}(\hat{\vec{\beta}})$ be the variance matrix of the
fitted coefficients, which is an $n\times n$ matrix, and $\hat{\vec{R}}$
be the matrix of unscaled Schoenfeld residuals, which is an $m\times n$ matrix.
\begin{eqnarray}
\vec{w} &=& \hat{\vec{t}} - \bar{\hat{t}}\ , \mbox{ a length-}m \mbox{
vector}\ ,\\
\vec{v} &=& m\,\vec{w}^T\hat{\vec{R}}\mathit{Var}(\hat{\vec{\beta}})\ , \mbox{
a length-}n\mbox{ vector}\\
z_i &=& \frac{1}{m\,\vec{w}^T\vec{w}}
\cdot\frac{v^2_i}{\left[\mathit{Var}(\hat{\vec{\beta}})\right]_{ii}}\
, \mbox{ for }i=1,\dots,n\ ,\\
p_i &=& 1 - \chi_1^2(z_i), \mbox{ for }i=1,\dots,n\ .
\end{eqnarray}
Here $\bar{\hat{t}}$ is the average of the transformed survival
times, $z_i$ is the $z$-statistic for the $i$-th coefficient, and $p_i$ is the
$p$-value for the $i$-th coefficient. We need a separate function to compute
$\vec{w}$, but both $\vec{v}$ and $\vec{w}^T\vec{w}$ can be computed
in an aggregate function, and $z_i$ and $p_i$ can be computed in the final
function of that aggregate. $\chi^2_1(\cdot)$ is the
chi-square distribution function with one degree of freedom.
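A direct NumPy/SciPy transcription of these formulas might look as follows (the
helper name \texttt{pha\_test\_pvalues} is ours; \texttt{scipy.stats.chi2} supplies
the chi-square distribution function).
\begin{verbatim}
import numpy as np
from scipy.stats import chi2

def pha_test_pvalues(t_transformed, residuals, var_beta):
    """p-values for the proportional-hazards test.
    t_transformed : length-m vector of transformed uncensored survival times
    residuals     : m x n matrix of unscaled Schoenfeld residuals
    var_beta      : n x n covariance matrix of the fitted coefficients"""
    m = len(t_transformed)
    w = t_transformed - t_transformed.mean()        # centered transformed times
    v = m * (w @ residuals @ var_beta)              # length-n vector
    z = v**2 / (m * (w @ w) * np.diag(var_beta))    # z-statistics
    p = 1.0 - chi2.cdf(z, df=1)                     # chi-square, 1 d.o.f.
    return z, p
\end{verbatim}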
\subsection{Implementation}
We can use the recurrence equations (\ref{eq:numerator_avg},
\ref{eq:denominator_avg}, \ref{eq:hessian}) to compute the residuals and the
Hessian, which is needed by the variance matrix.
The current implementation uses an ordered aggregate on the data to
compute these quantities, which does not run in parallel. To enable a distributed
solution, we can use the ``GROUP BY'' functionality provided by SQL to enable the
independent computation of the log-likelihood function in each distributed segment
corresponding to each stratum (`transition' function). These values can then be added
across the segments (`merge' function), with the gradient for the parameters computed
on the final sum (`final' function).
\subsection{Resolving Ties}
In ``coxph\_train'', the Breslow method is used to resolve ties. It uses
the following log partial likelihood~\cite{hosmer2011applied}:
\begin{eqnarray}
L &=& \sum_{i=1}^{m}\left[ \vec{\beta}^{T} \vec{x}_{i+} -
d_i \log\left( \sum_{j \in R(t_i)} e^{\vec{\beta}^T
\vec{x}_j}\right) \right]\\
&=& \sum_{i=1}^{m}\left[ \vec{\beta}^{T} \vec{x}_i - \log\left( \sum_{j \in R(t_i)} e^{\vec{\beta}^T
\vec{x}_j}\right)\right]_{+}
\end{eqnarray}
where $d_i$ is the number of observations that have the same $t_i$,
$x_{i+}$ is the sum of the covariates over all observations that
share the same $t_i$, and $m$ is the number of distinct survival
times. Here $[\cdot]_{+}$ means summing over all observations with the
same survival time.
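A minimal NumPy sketch of the Breslow log partial likelihood (the helper name
\texttt{breslow\_loglik} is ours) groups the tied death times and uses every
observation with a survival time at least $t_i$, censored or not, as the risk set.
\begin{verbatim}
import numpy as np

def breslow_loglik(X, times, delta, beta):
    """Breslow log partial likelihood under ties. x_{i+} is the sum of the
    covariates of the d_i deaths tied at t_i; the risk set is everyone with a
    survival time >= t_i."""
    eta = X @ beta
    w = np.exp(eta)
    L = 0.0
    for t in np.unique(times[delta == 1]):   # distinct death times
        deaths = (times == t) & (delta == 1)
        d_i = deaths.sum()
        x_plus_eta = eta[deaths].sum()        # beta^T x_{i+}
        S = w[times >= t].sum()               # risk-set denominator
        L += x_plus_eta - d_i * np.log(S)
    return L
\end{verbatim}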
\subsection{Robust Variance Estimators}
In MADlib, we implement the robust variance estimator devised by
Lin and Wei~\cite{lin1989robust}.
With the notation above, let
\begin{eqnarray*}
\vec{W}_{i}
&=& \delta_i \cdot \left[ \vec{x_i} - \frac{\left( \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}} \vec{x_l} \right)}{\sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}}} \right]
- \sum_{j: t_j \le t_i} \left\{ \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{\sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}}}
\cdot \left[ \vec{x_i} - \frac{\left( \sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}} \vec{x_k} \right)}{\sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}}} \right] \right\}.\\
&=& \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right]
- \sum_{j: t_j \le t_i} \left\{ \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j}
\cdot \left[ \vec{x_i} - \frac{\vec{H}_j}{S_j} \right] \right\}.\\
&=& \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right]
- \sum_{j: t_j \le t_i} \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j} \vec{x_i}
+ \sum_{j: t_j \le t_i} \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j} \frac{\vec{H}_j}{S_j}.\\
\end{eqnarray*}
where
\begin{eqnarray*}
H_i &=& \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}} \vec{x_l}\\
S_i &=& \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}}
\end{eqnarray*}
Let
\begin{eqnarray*}
A_i &=& \sum_{j: t_j \le t_i} \frac{\delta_j}{S_j},\\
\vec{B}_i &=& \sum_{j: t_j \le t_i} \frac{\delta_j \vec{H}_j}{S_j^2},\\
\end{eqnarray*}
we have
\[
\vec{W}_i = \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right] - e^{\vec{\beta}^T \vec{x_i}} A_i \vec{x_i} + e^{\vec{\beta}^T \vec{x_i}} \vec{B}_i.
\]
The recursions for $A_i$ and $\vec{B}_i$, assuming the times $t_i$ are
sorted in ascending order, are
\begin{eqnarray*}
A_i &=& A_{i-1} + \frac{\delta_i}{S_i}\\
\vec{B}_i &=& \vec{B}_{i-1} + \frac{\delta_i \vec{H}_i}{S_i^2}
\end{eqnarray*}
The meat part of the sandwich estimator is
\begin{equation}
\mathbb{M} = \sum_{i=1}^n\vec{W}_i\vec{W}_i^T\ .
\end{equation}
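For illustration, the NumPy sketch below (the helper name \texttt{robust\_meat}
is ours) computes the $\vec{W}_i$ via the two recursions above and accumulates
the meat matrix; it assumes distinct survival times sorted in ascending order,
so $\{l : t_l \ge t_i\}$ is rows $i$ onward and $\{j : t_j \le t_i\}$ is rows
up to $i$.
\begin{verbatim}
import numpy as np

def robust_meat(X, delta, beta):
    """Lin-Wei robust ('sandwich') meat matrix. H_i and S_i are accumulated
    in a backward pass; A_i and B_i in a forward pass, per the recursions."""
    n, m = X.shape
    w = np.exp(X @ beta)
    # Backward pass: risk-set sums H_i, S_i.
    H = np.zeros((n, m)); S = np.zeros(n)
    h = np.zeros(m); s = 0.0
    for i in range(n - 1, -1, -1):
        h = h + w[i] * X[i]; s = s + w[i]
        H[i] = h; S[i] = s
    # Forward pass: A_i, B_i and the score residuals W_i.
    A = 0.0; B = np.zeros(m)
    M = np.zeros((m, m))
    for i in range(n):
        A += delta[i] / S[i]
        B += delta[i] * H[i] / S[i]**2
        W = delta[i] * (X[i] - H[i] / S[i]) - w[i] * A * X[i] + w[i] * B
        M += np.outer(W, W)
    return M
\end{verbatim}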
\subsection{Clustered Variance Estimation}
The data has multiple clusters $m=1,\dots,K$, and the meat part is
\begin{equation}
\mathbb{M} = \mathbb{A}^T\mathbb{A}\ ,
\end{equation}
where the matrix $\mathbb{A}$'s $m$-th row is given by
\begin{equation}
\mathbb{A}_m = \sum_{i\in G_m} \vec{W}_i\ .
\end{equation}
Here $G_m$ is the set of rows that belong to the $m$-th cluster.
With stratification, we take the strata into account only when we
compute $\vec{H}_i$, $S_i$, $A_i$, $\vec{B}_i$ and $\vec{W}_i$. The
calculations of $\mathbb{A}_m$ and $\mathbb{M}$ only need to group by
the cluster variables; the strata variables are irrelevant.
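A small sketch of the clustered meat computation, assuming the per-observation
terms $\vec{W}_i$ (for example, returned by a variant of the robust-variance
sketch above) are stacked into a matrix with one row per observation and one
column per feature; the helper name \texttt{clustered\_meat} is ours.
\begin{verbatim}
import numpy as np

def clustered_meat(W, clusters):
    """Clustered meat matrix: the rows W_i are summed within each cluster to
    form the rows of A, and the meat is M = A^T A."""
    labels = np.unique(clusters)
    A = np.vstack([W[clusters == g].sum(axis=0) for g in labels])
    return A.T @ A
\end{verbatim}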
\section{How to Prevent Underflow and Overflow Errors}
A problem that is not mentioned above but appears in real
applications of CoxPH training is underflow or
overflow. The computation involves exponential functions,
and it is very easy to get underflow or overflow errors if the
coefficients become too small or too large at certain steps.

We use the same method as R's \texttt{survival} package to deal with
possible underflow/overflow errors. This method contains several
parts, which make the algorithm described above even more
complicated:
(1) Center and scale the independent variables:
\begin{equation}
x_i \rightarrow \frac{x_i - E[x]}{E[\vert x - E[x] \vert]}
\end{equation}
(2) Estimate the maximum possible value of all the coefficients using
the coefficients computed from the first iteration.
\begin{equation}
\beta_k^{(max)} = 20 \, \sqrt{\frac{h_{kk}}{\sum_i \delta_i}}\ ,
\end{equation}
where $\beta_k^{(max)}$ is the estimated maximum possible value
of the coefficient $\beta_k$, $h_{kk} = \partial^2 L / \partial
\beta_k^2$ is the $k$-th diagonal element of the Hessian matrix, and
$\delta_i \in \{0,1\}$ is the censoring status of the $i$-th record.
During the computation, whenever a coefficient satisfies $\vert \beta_k \vert
> \beta_k^{(max)}$, we set $\beta_k \rightarrow \mbox{sign}(\beta_k)
\beta_k^{(max)}$.
The authors of the \texttt{survival} package explain at
\texttt{http://stat.ethz.ch/R-manual/R-devel/RHOME/library/survival/doc/sourcecode.pdf}
why they use such an estimate for the coefficients:
``We use a cutpoint of $\beta * \mbox{std}(x) < 23$,
where the standard deviation is the average standard deviation of $x$ within a risk
set. The rationale is that $e^{23}$ is greater than the current world
population, so such a coefficient corresponds to a between-subject
relative risk that is larger than any imaginable.''
In their implementation, they also use $1/h_{kk}$ as an approximation
to $\mbox{std}(x)$, and besides $23$, they also use $20$ in the
estimate.
(3) Although (1) and (2) stabilize the computation, they are still not
enough, so a step-halving method is also used. Whenever the current iteration's
log-likelihood is smaller than that of the previous iteration, we replace
the coefficients with
\begin{equation}
\beta_k = \frac{1}{2}(\beta_k^{new} + \beta_k^{old})
\end{equation}
(4) The stopping threshold is
\begin{equation}
1 - \frac{L_{new}}{L_{old}} < \mbox{threshold}\ .
\end{equation}
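As a compact illustration of safeguards (2) and (3), the sketch below
(the helper name \texttt{safeguarded\_update} is ours) applies step-halving when
the log-likelihood decreases and clamps each coefficient to its estimated
maximum magnitude.
\begin{verbatim}
import numpy as np

def safeguarded_update(beta_old, beta_new, beta_max, L_old, L_new):
    """Apply the safeguards: step-halving (average new and old coefficients)
    whenever the log-likelihood decreased, then clamp so that
    |beta_k| <= beta_k^(max) elementwise."""
    if L_new < L_old:
        beta_new = 0.5 * (beta_new + beta_old)      # step-halving
    return np.clip(beta_new, -beta_max, beta_max)   # coefficient clamping
\end{verbatim}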
\section{Marginal Effects}
See \ref{sub:marginal_effects} for an introduction to marginal effects
(all notations below are the same as those defined in \ref{sub:marginal_effects}).
We implement the default algorithm used by Stata 13.1 for computing
marginal effects. Note that older versions of Stata may use a
different default algorithm and hence produce values different from MADlib's.
\subsection{Basic Formulation}
The relative hazard ratio for the $i$-th record is given by
\begin{equation}
h_i = h_0 \exp \left( \vec{\beta}\cdot\vec{f} \right)\ ,
\end{equation}
where $h_0$ is the baseline hazard and both $\vec{\beta}$ and
$\vec{f}$ are vectors. Here we use the indices
$i$ or $j$ for the data records, $a$ or $b$ for the indices of
covariate terms in $\vec{\beta}\cdot \vec{f}(\vec{x}_i)$, and
$u$ or $v$ for the indices of $\vec{x}$.

The value of the baseline $h_0$ is arbitrary and difficult to compute. Stata
ignores the baseline hazard value for the computation of marginal effects. In
MADlib, we follow the same principle and ignore the baseline (i.e.\ set the
baseline to 1) when computing the hazard value.
Thus the marginal effect corresponding to variable $x_k$ is computed as,
\begin{align*} \label{eq:me1}
\mathit{ME}_{k} & = \pder[h_i]{x_k} \\
& = \pder[e^{\vec{\beta} \vec{f}}]{x_k} \\
& = e^{\vec{\beta} \vec{f}} \vec{\beta} \pder[f]{x_k}.
\end{align*}
Vectorizing the above equation (similar to \ref{ssub:logistic_regression}) gives
\begin{equation*}
\mathit{ME} = e^{\vec{\beta} \vec{f}} J^T \vec{\beta},
\end{equation*}
where $J$ is defined in \ref{sub:marginal_effects_for_regression_methods}.
Censoring status and stratification are irrelevant to the marginal effect
calculation and can be ignored.
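For a single record, the vectorized formula above can be sketched in a few lines
of NumPy (the helper name \texttt{cox\_marginal\_effects} is ours; $J$ is assumed
to hold the partial derivatives $\partial f_a / \partial x_u$).
\begin{verbatim}
import numpy as np

def cox_marginal_effects(beta, f, J):
    """Marginal effects with the baseline hazard set to 1:
    ME = exp(beta . f) * J^T beta, where f is the vector of covariate terms
    for one record and J[a, u] = d f_a / d x_u is its Jacobian."""
    return np.exp(beta @ f) * (J.T @ beta)
\end{verbatim}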
\subsection{Categorical variables}
For categorical variables, we compute the discrete difference as described in
\ref{sub:categorical_variables}.
The discrete difference with respect to $x_k$ is given as
\begin{align*}
\mathit{ME_k} &= h^{set} - h^{unset} \\
&= e^{\vec{\beta}\vec{f^{set}}} - e^{\vec{\beta}\vec{f^{unset}}}
\end{align*}
\subsection{Standard Error}
As has already been described in \ref{sub:standard_errors}, the method to compute
the standard errors is
\begin{equation}
\mbox{Var}(ME) = \mathbb{S}\, \mbox{Var}(\vec{\beta})\,
\mathbb{S}^T,
\end{equation}
where the matrix $\mathbb{S}$, computed as the partial derivatives of
the marginal effects with respect to the coefficients, is an $M\times N$ matrix with entries $S_{mn}
= \pder[\mathit{ME}_m]{\beta_n}$:
\begin{align*}
S_{mn} & = \pder[\mathit{ME}_m]{\beta_n} \\
& = \pder{\beta_n} \left( e^{\vec{\beta} \vec{f}} \vec{\beta} \pder[\vec{f}]{x_m} \right)\\
& = e^{\vec{\beta} \vec{f}} \pder{\beta_n}\left(\vec{\beta} \pder[\vec{f}]{x_m} \right) +
\pder[e^{\vec{\beta} \vec{f}}]{\beta_n} \vec{\beta} \pder[\vec{f}]{x_m} \\
& = e^{\vec{\beta} \vec{f}} \pder[f_n]{x_m} +
e^{\vec{\beta} \vec{f}} \cdot f_n \cdot \vec{\beta} \pder[\vec{f}]{x_m} \\
& = e^{\vec{\beta} \vec{f}} \left( \pder[f_n]{x_m} +
f_n \cdot \vec{\beta} \pder[\vec{f}]{x_m} \right).
\end{align*}
Vectorizing this equation to express for the complete matrix,
\begin{equation*}
\vec{S} = e^{\vec{\beta} \vec{f}} \left(J^T +
(J^T \vec{\beta}) \vec{f}^T \right).
\end{equation*}