| % When using TeXShop on the Mac, let it know the root document. The following must be one of the first 20 lines. |
| % !TEX root = ../design.tex |
| |
| \chapter[Cox Proportional-Hazards]{Cox Proportional-Hazards} |
| |
| \section{Introduction} |
| % Abstract. What is the problem we want to solve? |
| Proportional-Hazard models enable comparison of \textit{survival models}. |
| Survival models are functions describing the probability of an one-item event |
| (prototypically, this event is death) with respect to time. The interval of |
| time before death occurs is the \textit{survival time}. Let $T$ be a random |
| variable representing the survival time, with a cumulative probability function |
| $P(t)$. Informally, $P(t)$ represents the probability that death has happened |
| before time $t$. |
| |
| An equivalent formation is the \textit{survival function} $S(t)$, defined as |
| $S(t) \equiv 1 - P(t)$. Informally, this is the probability that death hasn't |
| happened by time $t$. The \textit{hazard function} $h(t)$ which assesses the |
| instantaneous risk of demise at time $t$, conditional on survival upto time $t$. |
| \begin{align} |
| h(t) &= \lim_{\Delta t \rightarrow 0} \frac{p\left(t \le T < t + \Delta t | T \ge t \right)} |
| {\Delta t} \\ |
| &= \lim_{\Delta t \rightarrow 0} \frac{1}{\Delta t}\frac{p(T < t + \Delta t) - p(T < t)}{P(T \ge t)} |
| \end{align} |
| |
| The relationship between $h(t)$ and $S(t)$, using $S(t) = 1 - p(T < t)$ is |
| \begin{align} |
| h(t) & = \lim_{\Delta t \rightarrow 0} \frac{1}{\Delta t} \frac{\left(S(t + \Delta_t) - S(t)\right)}{-S(t)}\\ |
| & = \frac{-S'(t)}{S(t)} |
| \end{align} |
| where |
| $$S'(t) = \lim_{\Delta t \rightarrow 0} \frac{S(t + \Delta t) - S(t)}{\Delta t}$$ |
| denotes the derivative of $S(t)$. |
| |
| In the simplest case, the Cox model assumes that $h(t)$ is |
| \begin{equation} |
| h(t) = e^{\alpha(t)} |
| \end{equation} |
| |
| where exp$(\alpha(t))$ is the \textit{baseline function}, which depends only on |
| $t$. However, in many applications, the probability of death may depend on more |
| than just $t$. Other covariates, such as age or weight, may be important. Let |
| $x_i$ denote the observed value of the $i$th covariate, then the Cox model is |
| written as |
| |
| \begin{equation} |
| h(t) = e^{\alpha(t)} e^{\beta_1 x_1 + \beta_2 x_2 + \dots + \beta_m x_m} = e^{\alpha(t)} e^{\mathbf{\beta^T x}} |
| \end{equation} |
| where $\beta_i$ is the coefficient associated with the $i$th covariate. |
| |
| Many applications take values from multiple observations, measuring the values |
| of $x_i$ for each observation. %The $j$th observation has the hazard function |
| |
| %\begin{equation} |
| %h_j(t) = e^{\alpha(t)} e^{\beta_1 x_{j1} + \beta_2 x_{j2}+ \dots + \beta_k x_{jk}}= e^{\alpha(t)} e^{\mathbf{\beta^T x_j}}. |
| %\end{equation} |
| |
| In the \textit{proportional-hazard model}, the hazard functions of two |
| observations $j$ and $k$ are compared. The ratio of the two is |
| \begin{equation} |
| \frac{h_j(t)}{h_k(t)} = \frac{e^{\alpha(t)} e^{\mathbf{\beta^T x_j}} }{e^{\alpha(t)} e^{\mathbf{\beta^T x_k}} } = \frac{e^{\mathbf{\beta^T x_j} }}{ e^{\mathbf{\beta^T x_k}}} |
| \end{equation} |
| |
| The critical idea here is that the ratio of the two hazard functions is |
| completely independent of the baseline function. This allows meaningful |
| comparisons between samples without knowing the baseline function, which may be |
| difficult to measure or specify. |
| |
| % SECTION: Applications |
| \section{Applications}\label{cox:Application} |
| Generally, applications start with a list of $n$ observations, each with $m$ |
| covariates and a time of death. From this $n \times (m+1)$ matrix, we would |
| like to derive the correlation between the covariates and the hazard function. |
| This amounts to finding the values of $\beta$. |
| |
| The values of $\beta$ can be estimated with the method of \textit{partial |
| likelihood}. This method begins by sorting the observations by time of death |
| into a list $[t_1, t_2, \dots, t_n]$ such that $t_i \le t_j : i < j\ \forall |
| i,j$. For any time $t_i$, let $R(t_i)$ be the set of observations still alive |
| at time $t_i$. |
| |
| Given that there was a death at time $t_i$ and observation $k$ was alive prior |
| to $t_i$, the probability that the death happened to observation $k$ is |
| \begin{equation}\label{cox:equ:death-prob} |
| \Pr(T_k = t_i | R(t_i)) = \frac{e^{\mathbf{\beta^T x_k} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}}. |
| \end{equation} |
| |
| The \textit{partial likelihood function} can now be generated as the product of conditional probabilities. |
| More formally, |
| \begin{equation}\label{cox:equ:likelihood} |
| \mathcal{L} = \prod_{i = 1}^n \left( \frac{e^{\mathbf{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}} \right). |
| \end{equation} |
| |
| The log-likelihood form of this equation is |
| \begin{equation}\label{cox:equ:LLH} |
| L = \sum_{i = 1}^n \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right]. |
| \end{equation} |
| |
| An estimation of $\beta$ can be found by simply maximizing this log-likelihood. |
| To maximize the likelihood, it helps to have the derivative of equation |
| \ref{cox:equ:LLH}, which is |
| \begin{equation}\label{cox:equ:LLH derivative} |
| \frac{\partial L}{\partial \beta_k} = \sum_{i = 1}^n \left( x_{ik} - \frac{\sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } } \right). |
| \end{equation} |
| It follows that the second derivative is |
| \begin{equation}\label{cox:equ:LLH second derivative} |
| \frac{\partial^2 L}{\partial \beta_k \beta_l} = \sum_{i = 1}^n |
| \left( |
| \frac{\left( \sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x_j} } \right) |
| \left( \sum_{j \in R(t_i)} x_{jl} e^{\mathbf{\beta^T x_j} } \right)} |
| {\left( \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right)^2 } - |
| \frac{ \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\mathbf{\beta^T x_j} } } |
| {\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } } |
| \right). |
| \end{equation} |
| |
| % Incomplete Data |
| \subsection{Incomplete Data} |
| Frequently, not every observation will have an associated time of death. |
| Typically, this arises when the period of observation terminates before the |
| entire population being studied has died. This is known as \textit{censoring} |
| the data. To account for this, an additional indicator variable is introduced |
| $\delta_i$, which is set to 1 if the $i$th observation has an associated time of |
| death, and 0 otherwise. |
| |
| Incorporating this indicator variable into equation \ref{cox:equ:likelihood} gives |
| \begin{equation}\label{cox:equ:likelihood-censoring} |
| \mathcal{L} = \prod_{i = 1}^n \left( \frac{e^{\mathbf{\beta^T x_i} }}{ \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}}} \right)^{\delta_i}. |
| \end{equation} |
| The appropriate changes to the LLH function and its derivatives are trivial. |
| |
| |
| % SECTION: Implementation |
| \subsection{Partition and aggregation of the data to speed up} |
| |
| In order to speed up the computation, we first partition the data and |
| aggregate each piece of the data into one big row. During the |
| computation, the whole big row is loaded into the memory for |
| processing, which speeds up the computation. |
| |
| When we partition the data, we want to (1) keep the sorted descending |
| order of the time column, (2) make sure that each piece has |
| approximately the same amount of data so that the work load is even, |
| and (3) do this as fast as possible. |
| |
| Our solution to to first sample a certain amount of the data, and then |
| compute the approximate break points using the sampled data. The |
| sampled data should be small enough to load into the memory, abd also |
| large enough so that the break points can be computed relatively |
| accurate. |
| |
| After we have partitioned the data, we aggregate each partition into |
| one big row. The order of the data should be kept during the |
| aggregation. |
| |
| Then we use the sequential algorithm described in the next to process |
| the new data row by row in the order of the time column. Since each |
| big row contains lots of original rows, and we deal with them all in |
| the memory, this can speed up the computation. |
| |
| |
| \subsection{Implementation of Newton's Method} |
| Newton's method is the most common choice for estimating $\beta$ by minimizing |
| \ref{cox:equ:likelihood} using the following update rule: |
| \begin{equation} |
| \beta_{k} = \beta_{k} - \alpha_k \left( {\nabla^2 L}^{-1} \nabla L \right) |
| \end{equation} |
| where $\alpha_k$ is a positive scalar denoting the step length in the newton |
| direction ${\nabla^2 L}^{-1} \nabla L$ determined using the first and second |
| derivative information. We would like to emphasize that the problems we are |
| designing this system for are those with many records and few features i.e. $n |
| \gg m$, thereby keeping the inverse operation on the hessian matrix relatively |
| cheap. |
| |
| The gradient and Hessian matrix may be hard to parallelize therefore reducing an |
| advantage for large number of observations. To elaborate, consider equations |
| \ref{cox:equ:LLH derivative} and \ref{cox:equ:LLH second derivative} which are |
| sums with independent terms. One might think it is natural to reduce the |
| computational by parallelization. Efficient parallelization may be achieved if |
| each term could be computed independently in parallel by a set of worker tasks |
| and a master task could collect the output from each worker node sum them |
| together. However, this might not work well in this setting. To see why, |
| consider parallelizing equation \ref{cox:equ:LLH second derivative}. Each |
| worker task is given one term in the sum, which looks like |
| \begin{equation} |
| \frac{\left( \sum_{j \in R(t_i)} x_{jk} e^{\mathbf{\beta^T x _j} } \right) \left( \sum_{j \in R(t_i)} x_{jl} e^{\mathbf{\beta^T x _j} } \right)}{\left( \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } \right)^2 } - \frac{ \sum_{j \in R(t_i)} x_{jk}x_{jl} e^{\mathbf{\beta^T x_j} } }{\sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j} } }. |
| \end{equation} |
| |
| Note that the sums in the numerator are summing over all the data points in the |
| data matrix. A similar such issue is encountered while computing the first |
| derivative terms as defined in \ref{cox:equ:LLH derivative}. However, we note |
| that this sum has a structure that allows it to be computed in linear time (with |
| respect to the number of data points) using the following quantities. |
| \begin{align} |
| H_{i} &= \sum_{j \in R(t_i)} x_{j} e^{\mathbf{\beta^T x_j}}\\ |
| S_{i}&= \sum_{j \in R(t_i)} e^{\mathbf{\beta^T x_j}} \\ |
| V_{i}&= \sum_{j \in R(t_i)} x_{j}x_{j}^T e^{\mathbf{\beta^T x_j} } |
| \end{align} |
| |
| Note that $H_{i}$ is a column vector with $m$ elements ( $H_{i}\in |
| \mathbb{R}^m$), $S_{i}$ is a scalar and and $V_{i}$ is an $m \times m$ matrix. |
| We can now write the first derivative of the maximum likelihood estimator, |
| defined in Equation \ref{cox:equ:LLH derivative} as |
| \begin{align} |
| \frac{\partial L}{\partial \beta_k} = \sum_{i = 1}^n \left( x_{i} - \frac{H_{i} }{ S_{i}} \right) |
| \end{align} |
| while the second derivative, defined in Equation \ref{cox:equ:LLH second |
| derivative}, can be reduced to |
| |
| \begin{align} |
| \frac{\partial^2 L}{\partial \beta_k \beta_l} = \sum_{i = 1}^n \left( \frac{H_{i}H_{i}^T }{ S_{i}^2 } - \frac{V_{i}}{ S_{i} } \right) |
| \end{align} |
| Since we assume that the data points are sorted in increasing order i.e |
| $R(t_i) = \{i, i+1 \ldots n \}$, we can calculate the above summation as |
| \begin{align} |
| H_{i}& = H_{i+1} + x_{i} e^{\mathbf{\beta^T x_i}} \label{eq:numerator_avg}\\ |
| S_i & = S_{i+1} + e^{\mathbf{\beta^T x_i}} \label{eq:denominator_avg}\\ |
| V_i & = V_{i+1} + \frac{H_{i}H_{i}^T }{ S_{i}^2 } - \frac{V_{i}}{ |
| S_{i}} \label{eq:hessian}. |
| \end{align} |
| With this recurrence relationship, the hessian matrix and the gradient direction |
| can be computed in linear time. |
| |
| %In addition to the computational expense of computing the Hessian, the inverse |
| %Hessian must be computed as well. Unfortunately, matrix inversion is an |
| %expensive operation, and cannot be parallelized. |
| |
| % SECTION: Stratification Support |
| \section{Stratification Support}\label{cox:stratified} |
| A crucial component of the Cox Proportional Hazards model is the proportional hazards assumption: |
| The hazard for a given individual is a fixed proportion of the hazard for any |
| other individual in the same stratum, and the ratio of the hazards is constant |
| across time. |
| |
| In actual use cases, the proportional hazard assumption may not be satisfied if |
| we use all independent variables as covariates. A stratified Cox regression |
| model may then be useful. It offers a way such that we can choose a subset of |
| independent variables as covariates while are still taking the remaining |
| independent variables into account. The stratified Cox regression is available |
| in both R~\cite{rcox} and Stata~\cite{statacox}. |
| |
| Stratification is used as shorthand for building a Cox model that allows for |
| more than one stratum, and hence, allows for more than one baseline hazard |
| function. Stratification provides two pieces of key, flexible functionality for |
| the end user of Cox models: |
| \begin{enumerate} |
| \item Allows a categorical variable Z to be appropriately accounted for in |
| the model without estimating its predictive/associated impact on the |
| response variable (i.e. without estimating Z's ``coefficient''). |
| \item Categorical variable Z is predictive/associated with the response |
| variable, but Z may not satisfy the proportional hazards assumption. |
| \end{enumerate} |
| |
| To explicitly clarify how stratification differentiates from grouping support: |
| \begin{itemize} |
| \item Grouping by a categorical column would build a completely separate |
| Cox model for each value of the categorical column, where the baseline |
| hazards for each value of the categorical column would be different and |
| the estimated coefficient values for each explanatory variable would be |
| different for each value of the categorical column. |
| \item Stratifying by a categorical column would build a single |
| common Cox model, where the baseline hazards for each value of the |
| categorical column would be different, but the estimated coefficient values |
| for each explanatory variable would be the same for each value of the stratum. |
| \end{itemize} |
| |
| It is valuable to emphasize that all strata share all coefficients, and that the |
| only difference between strata is the baseline hazard. In other words, |
| coefficients for all non-strata explanatory variables are identical across the |
| different strata. |
| |
| \subsection{Estimating A Stratified Cox Model}\label{cox:estimate-stratified} |
| The parameter estimation is done by maximizing the product of likelihoods, each |
| from a stratum~\cite{stratifiedethzslides}. |
| |
| Given $n$ observations, each with $m$ covariates and each in one of $K$ |
| strata~\footnote{Note that this does not mean that we have $K$ variables other |
| than the $m$ covariates, but $K$ groups classified by strata ID variables.}, let |
| $\mathit{ST}_{k}$ denote the set of observations in the $k$-th stratum. |
| |
| Because, as an objective function, the sum of log-likelihoods is equivalent to |
| the product of likelihoods, according to Equation \ref{cox:equ:LLH}, we have the |
| log-likelihood associated with the $k$-th stratum, |
| \begin{equation}\label{cox:equ:obj-each-stratum}L_{k} = \sum_{i \in \mathit{ST}_{k}} \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right], |
| \end{equation} |
| where $R_k(t_i)$ the set of observations in $\mathit{ST}_{k}$ and still alive at time $t_i$. |
| |
| Therefore, the objective function of stratified cox regression can be expressed as |
| \begin{equation}\label{cox:equ:stratified-obj} |
| L^{\mathit{stratified}} = \sum_{k=1}^{K} L_{k} = \sum_{k=1}^{K} \sum_{i \in \mathit{ST}_{k}} \left[ \mathbf{\beta^T x_i} - \log\left(\sum_{j \in R_k(t_i)} e^{\mathbf{\beta^T x_j} } \right) \right]. |
| \end{equation} |
| The appropriate changes to gradient, Hessian and censoring are trivial. |
| |
| \subsection{When We Need A Stratified Cox Model?}\label{cox:diagnostic} |
| General practice in standard statistical packages (i.e. R, SAS) is to |
| make use of the Schoenfeld residuals to gauge whether or not the |
| proportional hazards assumption has been violated. |
| |
| % To this end, we need to investigate the implementation of R's |
| % \texttt{cox.zph()} function for MADlib. |
| % There are several parametric options available in the |
| % \texttt{cox.zph()} function, but we prioritize implementation of |
| % \texttt{cox.zph()} with its default parameter settings. |
| % The output of \texttt{cox.zph()} contains 3 numbers for each |
| % covariate, scaled Schoenfeld residual, chi-square, and p-value. |
| % The actual formulas of computing these are still to be investigated. |
| |
| The Schoenfeld residuals are centered on zero and should be |
| independent of time if PHA (proportional hazard assumption) is |
| true. Deviations from this, i.e. residuals that exhibit some trend in |
| time, indicate that the PHA is violated. |
| |
| The Schoenfeld residuals, at the time when a failure or death were to occur, are |
| defined by the difference between the observed and expected covariate values at |
| that time. Also note that the Schoenfeld residuals are zeroes for censored |
| observations. |
| \begin{equation} |
| \hat{\vec{r}}_i = \vec{x}_i - E[\vec{x}_i]\ ,\mbox{ only for |
| }\delta_i=1\ , |
| \end{equation} |
| |
| To compute the expected values at time $t_{i}$, we use the probability distribution given by Equation \ref{cox:equ:death-prob}. |
| \begin{eqnarray} |
| E[\vec{x}_i] &=& \sum_{k\in \mathcal{R}(t_i)}\vec{x}_k p(k\mbox{ dies at |
| }t_i) \nonumber\\ |
| &=& \frac{\sum_{k\in \mathcal{R}(t_i)}\vec{x}_k |
| e^{\vec{\beta}\vec{x}_k}}{\sum_{j\in |
| \mathcal{R}(t_i)}e^{\vec{\beta}\vec{x}_j} }, |
| \end{eqnarray} |
| where the values of $\vec{\beta}$ are the fitted coefficients, and $\mathcal{R}(t)$ is the set of individuals that are still alive at time $t$. |
| |
| \subsubsection{Scaled Schoenfeld Residuals}\label{cox:scaled-residual} |
| Suggested by Grambsch and Therneau \cite{grambsch1994proportional} and also |
| followed by statistical softwares \cite{testph}, scaling the Schoenfeld |
| residuals by an estimator of its variance yields greater diagnostic power. The |
| scaled Schoenfeld residuals is |
| \begin{eqnarray} |
| \hat{\vec{r}}^{*}_i &= [\mathit{Var}(\hat{\vec{r}}_i)]^{-1} \hat{\vec{r}}_i\\ |
| &\approx m \mathit{Var}(\hat{\vec{\beta}}) \hat{\vec{r}}_i , |
| \end{eqnarray} |
| where $\mathit{Var}$ denotes a covariance matrix, and $m$ is the |
| number of uncensored survival times. $\hat{\vec{r}}_i$ is a length-$n$ |
| vector, and $\mathit{Var}(\hat{\vec{\beta}})$ is a $n\times n$ |
| matrix, where $m$ is the number of uncensored data points and $n$ is |
| the number of features. |
| |
| \subsubsection{Transformation of Time}\label{cox:transform} |
| Transformation of the time values often helps the analysis of the correlation |
| between the scaled Schoenfeld residuals and time. Common transformation methods |
| include ranking, computing log, and Kaplan-Meier's method. (We don't fully |
| understand the last one yet.) |
| |
| \footnote{The \texttt{cox.zph()} function in R allows different options for |
| transformation of times: \texttt{``km''}, \texttt{``rank''}, |
| \texttt{``log''} and \texttt{``identity''}. |
| \texttt{``km''} stands for Kaplan-Meier's method. |
| \texttt{``rank''} takes the ranks of the times instead of times. |
| \texttt{``log''} uses the logarithm of times. |
| And \texttt{``identity''} does nothing and directly uses times.\cite{coxzph}} |
| |
| %To quantify whether there is a trend in the Schoenfeld residuals over |
| %time, we can compute the correlation between the scaled residuals and the |
| %ranks of the times, where the scaled residuals are given by |
| %\begin{equation} |
| %\hat{\vec{r}}_i = m \widehat{\mbox{Var}}(\vec{\beta}) \vec{r}_i\ , |
| %\end{equation} |
| %where $m$ is the number of uncensored survival times, and |
| %$\widehat{\mbox{Var}}(\vec{\beta})$ is the estimated variance matrix for |
| %the fitted coefficients. |
| |
| \subsubsection{$p$-values} |
| The process for computing the $p$-values of the correlation in R's |
| \texttt{survival} package is given |
| in the following. |
| |
| Let $m$ be the number of uncensored data points, $n$ be the number of |
| features, $\hat{\vec{t}}$ be the transformed survival time, which is a |
| length-$m$ vector, |
| $\mathit{Var}(\hat{\vec{\beta}})$ be the variance matrix for the |
| fitted coefficients which is a $n\times n$ matrix, $\hat{\vec{R}}$ |
| is the unscaled Schoenfeld residuals, which is a $m\times n$ matrix. |
| \begin{eqnarray} |
| \vec{w} &=& \hat{\vec{t}} - \bar{\hat{t}}\ , \mbox{ a length-}m \mbox{ |
| vector}\ ,\\ |
| \vec{v} &=& m\,\vec{w}^T\hat{\vec{R}}\mathit{Var}(\hat{\vec{\beta}})\ , \mbox{ |
| a length-}n\mbox{ vector}\\ |
| z_i &=& \frac{1}{m\,\vec{w}^T\vec{w}} |
| \cdot\frac{v^2_i}{\left[\mathit{Var}(\hat{\vec{\beta}})\right]_{ii}}\ |
| , \mbox{ for }i=1,\dots,n\ ,\\ |
| p_i &=& 1 - \chi_1^2(z_i), \mbox{ for }i=1,\dots,n\ . |
| \end{eqnarray} |
| Here $\bar{\hat{t}}$ is the average of the transformed survival |
| times. $z_i$ is the z-stats for $i$-th coefficient. $p_i$ is the |
| p-value for $i$-th coefficient. We need a separate function to compute |
| $\vec{w}$, but both $\vec{v}$ and $\vec{w}^T\vec{w}$ can be computed |
| in an aggregate function. $z_i$ and $p_i$ can be computed in the final |
| function of that aggregate function. $\chi^2_1(\cdot)$ is the |
| chi-square function with the degree of freedom being $1$. |
| |
| \subsection{Implementation} |
| We can use the iteration equations Eqs. (\ref{eq:numerator_avg}, |
| \ref{eq:denominator_avg}, \ref{eq:hessian}) to compute the residuals and the |
| hessian, which is needed by the variance matrix. |
| |
| The current implementation uses an ordered aggregate on the data to |
| compute these quantities, which is not in parallel. To enable a distributed |
| solution we can use the ``GROUP BY'' functionality provided by SQL to enable the |
| independent computation of the log-likelihood function in each distributed segment |
| corresponding to each strata (`transition' function). These values can then be added |
| across the segments (`merge' function), with the gradient for the parameter computed |
| on the final sum (`final' function). |
| |
| \subsection{Resolving Ties} |
| In ``coxph\_train'', Breslow method is used to resolve ties, which uses |
| as the partial likelihood~\cite{hosmer2011applied} |
| \begin{eqnarray} |
| L &=& \sum_{i=1}^{m}\left[ \vec{\beta}^{T} \vec{x}_{i+} - |
| d_i \log\left( \sum_{j \in R(t_i)} e^{\vec{\beta}^T |
| \vec{x}_j}\right) \right]\\ |
| &=& \sum_{i=1}^{m}\left[ \vec{\beta}^{T} \vec{x}_i - \log\left( \sum_{j \in R(t_i)} e^{\vec{\beta}^T |
| \vec{x}_j}\right)\right]_{+} |
| \end{eqnarray} |
| where $d_i$ is the number of observations that have the same $t_i$, |
| and $x_{i+}$ is the sum of the covariants over all observations that |
| have the same $t_i$. $m$ is the number of distinct survival |
| times. Here $[\cdot]_{+}$ means sum over all observations with the |
| same survival time. |
| |
| \subsection{Robust Variance Estimators} |
| In MADlib, we implement robust variance estimator devised by |
| Lin and Wei~\cite{lin1989robust}. |
| With our notations above, let |
| \begin{eqnarray*} |
| \vec{W}_{i} |
| &=& \delta_i \cdot \left[ \vec{x_i} - \frac{\left( \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}} \vec{x_l} \right)}{\sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}}} \right] |
| - \sum_{j: t_j \le t_i} \left\{ \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{\sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}}} |
| \cdot \left[ \vec{x_i} - \frac{\left( \sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}} \vec{x_k} \right)}{\sum_{k: t_k \ge t_j} e^{\vec{\beta}^T \vec{x_k}}} \right] \right\}.\\ |
| &=& \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right] |
| - \sum_{j: t_j \le t_i} \left\{ \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j} |
| \cdot \left[ \vec{x_i} - \frac{\vec{H}_j}{S_j} \right] \right\}.\\ |
| &=& \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right] |
| - \sum_{j: t_j \le t_i} \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j} \vec{x_i} |
| + \sum_{j: t_j \le t_i} \delta_j \cdot \frac{e^{\vec{\beta}^T \vec{x_i}}}{S_j} \frac{\vec{H}_j}{S_j}.\\ |
| \end{eqnarray*} |
| where |
| \begin{eqnarray*} |
| H_i &=& \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}} \vec{x_l}\\ |
| S_i &=& \sum_{l: t_l \ge t_i} e^{\vec{\beta}^T \vec{x_l}} |
| \end{eqnarray*} |
| |
| Let |
| \begin{eqnarray*} |
| A_i &=& \sum_{j: t_j \le t_i} \frac{\delta_j}{S_j},\\ |
| \vec{B}_i &=& \sum_{j: t_j \le t_i} \frac{\delta_j \vec{H}_j}{S_j^2},\\ |
| \end{eqnarray*} |
| we have |
| \[ |
| \vec{W}_i = \delta_i \cdot \left[ \vec{x_i} - \frac{\vec{H}_i}{S_i} \right] - e^{\vec{\beta}^T \vec{x_i}} A_i \vec{x_i} + e^{\vec{\beta}^T \vec{x_i}} \vec{B}_i. |
| \] |
| And the recursions of $A_i$ and $\vec{B}_i$ are given if we assume time $t_i$ is |
| sorted ascending order |
| \begin{eqnarray*} |
| A_i &=& A_{i-1} + \frac{\delta_i}{S_i}\\ |
| \vec{B}_i &=& \vec{B}_{i-1} + \frac{\delta_i \vec{H}_i}{S_i^2} |
| \end{eqnarray*} |
| |
| The meat part of the sandwich estimator is |
| \begin{equation} |
| \mathbb{M} = \sum_{i=1}^n\vec{W}_i\vec{W}_i^T\ . |
| \end{equation} |
| |
| \subsection{Clustered Variance Estimation} |
| |
| The data has multiple clusters $m=1,\dots,K$, and the meat part is |
| \begin{equation} |
| \mathbb{M} = \mathbb{A}^T\mathbb{A}\ , |
| \end{equation} |
| where the matrix $\mathbb{A}$'s $m$-th row is given by |
| \begin{equation} |
| \mathbb{A}_m = \sum_{i\in G_m} \vec{W}_i\ . |
| \end{equation} |
| Here $G_m$ is the set of rows of that belong to the $m$-th cluster. |
| |
| With stratification, we take into account of the strata only when we |
| compute $\vec{H}_i$, $S_i$, $A_i$, $\vec{B}_i$ and $\vec{W}_i$. The |
| calculations of $\mathbb{A}_m$ and $\mathbb{M}$ only need to group by |
| the clustered variables and the strata variables are irrelavent. |
| |
| \section{How to prevent under/over -flow errors ?} |
| |
| A problem that is not mentioned above but appears in the real |
| applications of the CoxPH training model is the under-flow or |
| over-flow errors. We have exponential functions in the computation, |
| and it is very easy to get under-flow or over-flow errors if the |
| coefficients become too small or large at certain steps. |
| |
| We use the same method as R's "survival" package to deal with the |
| possible under-flow/over-flow errors. This methods contains three |
| parts, which makes the algorithm described above even more |
| complicated: |
| |
| (1) center and scale the independent variables. |
| \begin{equation} |
| x_i \rightarrow \frac{x_i - E[x]}{E[\vert x_i - E[x] \vert]} |
| \end{equation} |
| |
| (2) Estimate the maximum possible value of all the coefficients using |
| the coefficients computed from the first iteration. |
| \begin{equation} |
| \beta_k^{(max)} = 20 \, \sqrt{\frac{h_{kk}}{\sum_i \delta_i}}\ , |
| \end{equation} |
| where $\beta_k^{(max)}$ is the estimate of the possible maximum value |
| of the coefficient $\beta_k$, $h_{kk} = \partial^2 L / \partial |
| \beta_k^2$ is the diagonal elements of the Hessian matrix, and |
| $\delta_i=0,1$ is the censoring status of the records. |
| |
| During the computation, whenever the coefficient $\vert \beta_k \vert |
| > \beta_k^{(max)}$, we set $\beta_k \rightarrow \mbox{sign}(\beta_k) |
| \beta_k^{(max)}$. |
| |
| The authors of "survival" package explains in |
| http://stat.ethz.ch/R-manual/R-devel/RHOME/library/survival/doc/sourcecode.pdf |
| why they use such an estimate for the coefficients: |
| |
| "We use a cutpoint of $\beta * \mbox{std}(x) < 23$ |
| where the standard deviation is the average standard deviation of $x$ within a risk |
| set. The rationale is that $e^{23}$ is greater than the current world |
| population, so such a coefficient corresponds to a between subject |
| relative risk that is larger than any imaginable." |
| |
| In their implementation, they also use $1/h_{kk}$ as an approximation |
| to $\mbox{std}(x)$. And besides $23$, they also use $20$ in the |
| estimate. |
| |
| (3) Although (1) and (2) stabalize the computation, it is still not |
| enough. Step-halving method is used. Whenever the current iteration's |
| log-likelihood is smaller than that of previous iteration, we accept |
| the coefficients as |
| \begin{equation} |
| \beta_k = \frac{1}{2}(\beta_k^{new} + \beta_k^{old}) |
| \end{equation} |
| |
| (4) The stopping threshold is |
| \begin{equation} |
| 1 - \frac{L_{new}}{L_{old}} < \mbox{threshold}\ . |
| \end{equation} |
| |
| \section{Marginal Effects} |
| |
| See \ref{sub:marginal_effects} for an introduction to marginal effects |
| (all notations below are the same as those defined in \ref{sub:marginal_effects}). |
| We implement the default algorithm used by Stata 13.1 for computing the |
| marginal effects. Note that older versions of Stata may use a |
| different default algorithm to compute values different from MADlib. |
| |
| \subsection{Basic Formulation} |
| |
| The relative hazard ratio for $i$-th record is given by |
| \begin{equation} |
| h_i = h_0 \exp \left( \vec{\beta}\cdot\vec{f} \right)\ , |
| \end{equation} |
| where $h_0$ is the baseline and both $\vec{\beta}$ and |
| $\vec{f}$ are vectors. Here we use the indices |
| $i$ or $j$ for the data records, and $a$ or $b$ for the indices of |
| covariate terms in $\vec{\beta}\cdot \vec{f}(\vec{x}_i)$. And we will use |
| $u$ or $v$ to denote the indices of $\vec{x}$. |
| |
| The value of the baseline $h_0$ is arbitrary and difficult to compute. Strata |
| ignores the baseline hazard value for the computation of marginal effect. For |
| MADlib, we follow the same principle and ignore the baseline (i.e. set baseline |
| as 1) to compute hazard value. |
| |
| Thus the marginal effect corresponding to variable $x_k$ is computed as, |
| \begin{align*} \label{eq:me1} |
| \mathit{ME}_{k} & = \pder[h_i]{x_k} \\ |
| & = \pder[e^{\vec{\beta} \vec{f}}]{x_k} \\ |
| & = e^{\vec{\beta} \vec{f}} \vec{\beta} \pder[f]{x_k}. |
| \end{align*} |
| Vectorizing the above equation (similar to \ref{ssub:logistic_regression}) gives |
| \begin{equation*} |
| \mathit{ME} = e^{\vec{\beta} \vec{f}} J^T \vec{\beta}, |
| \end{equation*} |
| where $J$ is defined in \ref{sub:marginal_effects_for_regression_methods}. |
| |
| |
| Censoring status and stratification are irrelavant to the marginal effect |
| calculation and can be ignored. |
| |
| \subsection{Categorical variables} |
| |
| For categorical variables, we compute the discrete difference as described in |
| \ref{sub:categorical_variables}. |
| The discrete difference with respect to $x_k$ is given as |
| \begin{align*} |
| \mathit{ME_k} &= h^{set} - h^{unset} \\ |
| &= e^{\vec{\beta}\vec{f^{set}}} - e^{\vec{\beta}\vec{f^{unset}}} |
| \end{align*} |
| |
| \subsection{Standard Error} |
| |
| As has already been described in \ref{sub:standard_errors}, the method to compute |
| the standard errors is |
| \begin{equation} |
| \mbox{Var}(ME) = \mathbb{S}\, \mbox{Var}(\vec{\beta})\, |
| \mathbb{S}^T, |
| \end{equation} |
| where the matrix $\mathbb{S}$, computed as the partial derivative of |
| the marginal effect over all the coefficients, is a $M\times N$ matrix $S_{mn} |
| = \pder[\mathit{ME}_m]{\beta_n}$ |
| |
| \begin{align*} |
| S_{mn} & = \pder[\mathit{ME}_m]{\beta_n} \\ |
| & = \pder{\beta_n} \left( e^{\vec{\beta} \vec{f}} \vec{\beta} \pder[\vec{f}]{x_m} \right)\\ |
| & = e^{\vec{\beta} \vec{f}} \pder{\beta_n}\left(\vec{\beta} \pder[\vec{f}]{x_m} \right) + |
| \pder[e^{\vec{\beta} \vec{f}}]{\beta_n} \vec{\beta} \pder[\vec{f}]{x_m} \\ |
| & = e^{\vec{\beta} \vec{f}} \pder[f_n]{x_m} + |
| e^{\vec{\beta} \vec{f}} \cdot f_n \cdot \vec{\beta} \pder[\vec{f}]{x_m} \\ |
| & = e^{\vec{\beta} \vec{f}} \left( \pder[f_n]{x_m} + |
| f_n \cdot \vec{\beta} \pder[\vec{f}]{x_m} \right). |
| \end{align*} |
| |
| Vectorizing this equation to express for the complete matrix, |
| \begin{equation*} |
| \vec{S} = e^{\vec{\beta} \vec{f}} \left(J^T + |
| (J^T \vec{\beta}) \vec{f}^T \right). |
| \end{equation*} |