| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL. |
| # The Breslow method is used for handling ties and the regression parameters |
| # are computed using trust region newton method with conjugate gradient |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Matrix --- Location to read the input matrix X containing the survival data |
| # containing the following information |
| # 1: timestamps |
| # 2: whether an event occurred (1) or data is censored (0) |
| # 3: feature vectors |
| # TE Matrix --- Column indices of X as a column vector which contain timestamp |
| # (first row) and event information (second row) |
| # F Matrix --- Column indices of X as a column vector which are to be used for |
| # fitting the Cox model |
| # R Matrix --- If factors (categorical variables) are available in the input matrix |
| # X, location to read matrix R containing the start and end indices of |
| # the factors in X |
| # R[,1]: start indices |
| # R[,2]: end indices |
| # Alternatively, user can specify the indices of the baseline level of |
| # each factor which needs to be removed from X; in this case the start |
| # and end indices corresponding to the baseline level need to be the same; |
| # if R is not provided by default all variables are considered to be continuous |
| # alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas |
| # tol Double 0.000001 Tolerance ("epsilon") |
| # moi Int 100 Max. number of outer (Newton) iterations |
| # mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
| # 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema: |
| # M[,1]: betas |
| # M[,2]: exp(betas) |
| # M[,3]: standard error of betas |
| # M[,4]: Z |
| # M[,5]: P-value |
| # M[,6]: lower 100*(1-alpha)% confidence interval of betas |
| # M[,7]: upper 100*(1-alpha)% confidence interval of betas |
| # |
| # Two matrices containing a summary of some statistics of the fitted model: |
| # 1 - File S with the following format |
| # - row 1: no. of observations |
| # - row 2: no. of events |
| # - row 3: log-likelihood |
| # - row 4: AIC |
| # - row 5: Rsquare (Cox & Snell) |
| # - row 6: max possible Rsquare |
| # 2 - File T with the following format |
| # - row 1: Likelihood ratio test statistic, degree of freedom, P-value |
| # - row 2: Wald test statistic, degree of freedom, P-value |
| # - row 3: Score (log-rank) test statistic, degree of freedom, P-value |
| # |
| # Additionally, the following matrices are stored (needed for prediction) |
| # 1- A column matrix RT that contains the order-preserving recoded timestamps from X |
| # 2- Matrix XO which is matrix X with sorted timestamps |
| # 3- Variance-covariance matrix of the betas COV |
| # 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available) |
| # ------------------------------------------------------------------------------------------- |
| |
| m_cox = function(Matrix[Double] X, Matrix[Double] TE, Matrix[Double] F, Matrix[Double] R, |
| Double alpha = 0.05, Double tol = 0.000001, Int moi = 100, Int mii = 0) |
| return (Matrix[Double] M, Matrix[Double] S, Matrix[Double] T, Matrix[Double] COV, Matrix[Double] RT, Matrix[Double] XO) { |
| |
| X_orig = X; |
| R_1_1 = as.scalar(R[1,1]); |
| |
| ######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET |
| |
| if (R_1_1 != 0) { # factors available |
| if (ncol (R) != 2) { |
| stop ("Matrix R has wrong dimensions!"); |
| } |
| print ("REMOVING BASELINE LEVEL OF EACH FACTOR..."); |
| # identify baseline columns to be removed from X_orig |
| col_sum = colSums (X_orig); |
| col_seq = t (seq(1, ncol (X_orig))); |
| |
| parfor (i in 1:nrow (R), check = 0) { |
| start_ind = as.scalar (R[i,1]); |
| end_ind = as.scalar (R[i,2]); |
| baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1; |
| col_seq[,baseline_ind] = 0; |
| } |
| ones = matrix (1, rows = nrow (F), cols = 1); |
| F_filter = table (ones, F, 1, ncol (X_orig)); |
| F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols"); |
| TE_F = t(cbind (t (TE), F_filter)); |
| } else if (ncol(F) > 0 & nrow(F) > 0) { # all features scale |
| TE_F = t(cbind (t (TE), t(F))); |
| } else { # no features available |
| TE_F = TE; |
| } |
| |
| X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F)); |
| |
| ######## RECODING TIMESTAMPS PRESERVING THE ORDER |
| print ("RECODING TIMESTAMPS..."); |
| |
| N = nrow (X_orig); |
| X_orig = order (target = X_orig, by = 1); |
| Idx = matrix (1, rows = N, cols = 1); |
| num_timestamps = 1; |
| if (N == 1) |
| RT = matrix (1, rows = 1, cols = 1); |
| else { |
| Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]); |
| num_timestamps = sum (Idx); |
| A = removeEmpty (target = diag (Idx), margin = "cols"); |
| if (ncol (A) > 1) { |
| A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)]; |
| B = cumsum (A); |
| RT = B %*% seq(1, ncol(B)); |
| } else { # there is only one group |
| RT = matrix (1, rows = N, cols = 1); |
| } |
| } |
| E = X_orig[,2]; |
| |
| print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT"); |
| |
| ######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT |
| # b: the regression betas |
| # o: loss function value |
| # g: loss function gradient |
| # H: loss function Hessian |
| # sb: shift of b in one iteration |
| # so: shift of o in one iteration |
| # r: CG residual = H %*% sb + g |
| # d: CG direction vector |
| # Hd: = H %*% d |
| # c: scalar coefficient in CG |
| # delta: trust region size |
| # tol: tolerance value |
| # i: outer (Newton) iteration count |
| # j: inner (CG) iteration count |
| |
| # computing initial coefficients b (all initialised to 0) |
| if (ncol (X_orig) > 2) { |
| X = X_orig[,3:ncol(X_orig)]; |
| D = ncol (X); |
| zeros_D = matrix (0, rows = D, cols = 1); |
| b = zeros_D; |
| } |
| d_r = aggregate (target = E, groups = RT, fn = "sum"); |
| e_r = aggregate (target = RT, groups = RT, fn = "count"); |
| |
| # computing initial loss function value o |
| num_distinct = nrow (d_r); # no. of distinct timestamps |
| e_r_rev_agg = cumsum (rev(e_r)); |
| d_r_rev = rev(d_r); |
| o = sum (d_r_rev * log (e_r_rev_agg)); |
| o_init = o; |
| |
| if (ncol (X_orig) < 3) { |
| loglik = -o; |
| S = matrix(0, 6, 1); |
| S[1, 1] = N; |
| S[2, 1] = 0; # number of events |
| S[3, 1] = loglik; |
| S[4, 1] = -1; # AIC |
| S[5, 1] = -1; # Rsquare |
| S[6, 1] = -1; #Rsquare_max |
| |
| stop ("No features are selected!"); |
| } |
| |
| # computing initial gradient g |
| # part 1 g0_1 |
| g0_1 = - t (colSums (X * E)); # g_1 |
| # part 2 g0_2 |
| X_rev_agg = cumsum (rev(X)); |
| select = table (seq (1, num_distinct), e_r_rev_agg); |
| X_agg = select %*% X_rev_agg; |
| g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg)); |
| g0 = g0_1 + g0_2; |
| g = g0; |
| |
| # initialization for trust region Newton method |
| delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2))); |
| initial_g2 = sum (g ^ 2); |
| exit_g2 = initial_g2 * tol ^ 2; |
| maxiter = 100; |
| maxinneriter = min (D, 100); |
| i = 0; |
| sum_g2 = sum (g ^ 2); |
| |
| while (sum_g2 > exit_g2 & i < maxiter) { |
| i = i + 1; |
| sb = zeros_D; |
| r = g; |
| r2 = sum (r ^ 2); |
| exit_r2 = 0.01 * r2; |
| d = - r; |
| trust_bound_reached = FALSE; |
| j = 0; |
| |
| exp_Xb = exp (X %*% b); |
| exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); |
| D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator |
| X_exp_Xb = X * exp_Xb; |
| X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); |
| X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; |
| |
| while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) { |
| j = j + 1; |
| # computing Hessian times d (Hd) |
| # part 1 Hd_1 |
| Xd = X %*% d; |
| X_Xd_exp_Xb = X * (Xd * exp_Xb); |
| X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb)); |
| X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg; |
| |
| Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev; |
| # part 2 Hd_2 |
| Xd_exp_Xb = Xd * exp_Xb; |
| Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb)); |
| Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg; |
| |
| Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator |
| Hd_2 = Hd_2_num / (D_r_rev ^ 2); |
| |
| Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev)); |
| |
| c = r2 / sum (d * Hd); |
| [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2); |
| sb = sb + c * d; |
| r = r + c * Hd; |
| r2_new = sum (r ^ 2); |
| d = - r + (r2_new / r2) * d; |
| r2 = r2_new; |
| } |
| |
| # computing loss change in 1 iteration (so) |
| # part 1 so_1 |
| so_1 = - as.scalar (colSums (X * E) %*% (b + sb)); |
| # part 2 so_2 |
| exp_Xbsb = exp (X %*% (b + sb)); |
| exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum"); |
| so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg)))); |
| |
| so = so_1 + so_2; |
| so = so - o; |
| |
| delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g))); |
| if (so < 0) { |
| b = b + sb; |
| o = o + so; |
| # compute new gradient g |
| exp_Xb = exp (X %*% b); |
| exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); |
| X_exp_Xb = X * exp_Xb; |
| X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); |
| X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; |
| |
| D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator |
| g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev)); |
| g = g0_1 + g_2; |
| sum_g2 = sum (g ^ 2); |
| } |
| } |
| |
| if (sum_g2 > exit_g2 & i >= maxiter) { |
| print ("Trust region Newton method did not converge!"); |
| } |
| |
| print ("COMPUTING HESSIAN..."); |
| |
| H0 = matrix (0, rows = D, cols = D); |
| H = matrix (0, rows = D, cols = D); |
| |
| X_exp_Xb_rev_2 = rev(X_exp_Xb); |
| X_rev_2 = rev(X); |
| |
| X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); |
| X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; |
| |
| parfor (i in 1:D, check = 0) { |
| Xi = X[,i]; |
| Xi_rev = rev(Xi); |
| |
| ## ----------Start calculating H0-------------- |
| # part 1 H0_1 |
| Xi_X = X_rev_2[,i:D] * Xi_rev; |
| Xi_X_rev_agg = cumsum (Xi_X); |
| Xi_X_rev_agg = select %*% Xi_X_rev_agg; |
| H0_1 = Xi_X_rev_agg / e_r_rev_agg; |
| # part 2 H0_2 |
| Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum"); |
| Xi_agg_rev_agg = cumsum (rev(Xi_agg)); |
| H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator |
| H0_2 = H0_2_num / (e_r_rev_agg ^ 2); |
| H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev); |
| |
| #-----------End calculating H0-------------------- |
| |
| ## ----------Start calculating H-------------- |
| # part 1 H_1 |
| Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev; |
| Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb); |
| Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg; |
| H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev; |
| # part 2 H_2 |
| Xi_exp_Xb = exp_Xb * Xi; |
| Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum"); |
| Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg)); |
| H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator |
| H_2 = H_2_num / (D_r_rev ^ 2); |
| H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev); |
| #-----------End calculating H-------------------- |
| } |
| |
| H = H + t(H) - diag( diag (H)); |
| H0 = H0 + t(H0) - diag( diag (H0)); |
| |
| # compute standard error for betas |
| H_inv = inv (H); |
| se_b = sqrt (diag (H_inv)); |
| |
| # compute exp(b), Z, Pr[>|Z|] |
| P = matrix (0, rows = D, cols = 1); |
| exp_b = exp (b); |
| Z = b / se_b; |
| |
| for (i in 1:D) { |
| Z_i_1_scalar = abs (as.scalar (Z[i,1])); |
| #FIXME: enable after test setup has been fixed |
| #P[i,1] = cdf (target = Z_i_1_scalar, dist = "normal"); |
| } |
| |
| # compute confidence intervals for b |
| z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal"); |
| CI_l = b - se_b * z_alpha_2; |
| CI_r = b - se_b + z_alpha_2; |
| |
| ######## SOME STATISTICS AND TESTS |
| S = matrix(0, 6, 1); |
| |
| # no. of records |
| S[1, 1] = N; |
| |
| # no.of events |
| S[2, 1] = sum (E); |
| |
| # log-likelihood |
| loglik = -o; |
| S[3, 1] = loglik; |
| |
| # AIC |
| AIC = -2 * loglik + 2 * D; |
| S[4, 1] = AIC; |
| |
| # Likelihood ratio test |
| lratio_t = 2 * o_init - 2 * o; |
| lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D); |
| |
| # Rsquare (Cox & Snell) |
| Rsquare = 1 - exp (-lratio_t / N); |
| Rsquare_max = 1 - exp (-2 * o_init / N); |
| S[5, 1] = Rsquare; |
| S[6, 1] = Rsquare_max; |
| |
| T = matrix(0, 3, 3); |
| # Wald test |
| wald_t = as.scalar (t(b) %*% H %*% b); |
| wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D); |
| T[1, 1] = wald_t; |
| T[1, 2] = D; |
| T[1, 3] = wald_p; |
| |
| # Likelihood ratio test |
| T[2, 1] = lratio_t; |
| T[2, 2] = D; |
| T[2, 3] = lratio_p; |
| |
| H0_inv = inv (H0); |
| score_t = as.scalar (t (g0) %*% H0_inv %*% g0); |
| score_p = 1 - cdf (target = score_t, dist = "chisq", df = D); |
| # T[3, 1] = score_t; |
| T[3, 2] = D; |
| # T[3, 3] = score_p; |
| |
| M = matrix (1, rows = D, cols = 7); |
| M[, 1] = b; |
| M[, 2] = exp_b; |
| # M[, 3] = se_b; |
| # M[, 4] = Z; |
| # M[, 5] = P; |
| # M[, 6] = CI_l; |
| # M[, 7] = CI_r; |
| |
| COV = H; |
| XO = X_orig; |
| } |
| |
| ensure_trust_bound = function (double x, double a, double b, double c) |
| return (double x_new, boolean is_violated) |
| { |
| if (a * x^2 + b * x + c > 0) { |
| is_violated = TRUE; |
| rad = sqrt (b ^ 2 - 4 * a * c); |
| if (b >= 0) |
| x_new = - (2 * c) / (b + rad); |
| else |
| x_new = - (b - rad) / (2 * a); |
| } else { |
| is_violated = FALSE; |
| x_new = x; |
| } |
| } |
| |
| update_trust_bound = function (double delta, double sb_distance, |
| double so_exact, double so_linear_approx, double so_quadratic_approx) |
| return (double delta) |
| { |
| sigma1 = 0.25; |
| sigma2 = 0.5; |
| sigma3 = 4.0; |
| |
| if (so_exact <= so_linear_approx) |
| alpha = sigma3; |
| else |
| alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx)); |
| |
| rho = so_exact / so_quadratic_approx; |
| if (rho < 0.0001) { |
| delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta); |
| } else { |
| if (rho < 0.25) { |
| delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta)); |
| } else { |
| if (rho < 0.75) { |
| delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta)); |
| } else { |
| delta = max (delta, min (alpha * sb_distance, sigma3 * delta)); |
| } |
| } |
| } |
| } |