| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL. |
| # The Breslow method is used for handling ties and the regression parameters |
| # are computed using trust region newton method with conjugate gradient |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X String --- Location to read the input matrix X containing the survival data containing the following information |
| # - 1: timestamps |
| # - 2: whether an event occurred (1) or data is censored (0) |
| # - 3: feature vectors |
| # TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row) |
| # F String " " Column indices of X as a column vector which are to be used for fitting the Cox model |
| # R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing |
| # the start and end indices of the factors in X |
| # - R[,1]: start indices |
| # - R[,2]: end indices |
| # Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X; |
| # in this case the start and end indices corresponding to the baseline level need to be the same; |
| # if R is not provided by default all variables are considered to be continuous |
| # M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted |
| # Cox model (the betas), their standard errors, confidence intervals, and P-values |
| # S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including |
| # no. of records, no. of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare; |
| # by default is standard output |
| # T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model; |
| # by default is standard output |
| # COV String --- Location to store the variance-covariance matrix of the betas |
| # RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X |
| # XO String --- Location to store sorted input matrix by the timestamps |
| # MF String --- Location to store column indices of X excluding the baseline factors if available |
| # alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas |
| # tol Double 0.000001 Tolerance ("epsilon") |
| # moi Int 100 Max. number of outer (Newton) iterations |
| # mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max |
| # fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
| # 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema: |
| # M[,1]: betas |
| # M[,2]: exp(betas) |
| # M[,3]: standard error of betas |
| # M[,4]: Z |
| # M[,5]: P-value |
| # M[,6]: lower 100*(1-alpha)% confidence interval of betas |
| # M[,7]: upper 100*(1-alpha)% confidence interval of betas |
| # |
| # Two matrices containing a summary of some statistics of the fitted model: |
| # 1- File S with the following format |
| # - row 1: no. of observations |
| # - row 2: no. of events |
| # - row 3: log-likelihood |
| # - row 4: AIC |
| # - row 5: Rsquare (Cox & Snell) |
| # - row 6: max possible Rsquare |
| # 2- File T with the following format |
| # - row 1: Likelihood ratio test statistic, degree of freedom, P-value |
| # - row 2: Wald test statistic, degree of freedom, P-value |
| # - row 3: Score (log-rank) test statistic, degree of freedom, P-value |
| # |
| # Additionally, the following matrices are stored (needed for prediction) |
| # 1- A column matrix RT that contains the order-preserving recoded timestamps from X |
| # 2- Matrix XO which is matrix X with sorted timestamps |
| # 3- Variance-covariance matrix of the betas COV |
| # 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available) |
| # ------------------------------------------------------------------------------------------- |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R |
| # M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT |
| # XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv |
| |
| fileX = $X; |
| fileTE = $TE; |
| fileRT = $RT; |
| fileMF = $MF; |
| fileM = $M; |
| fileXO = $XO; |
| fileCOV = $COV; |
| |
| # Default values of some parameters |
| fileF = ifdef ($F, " "); # $F=" " |
| fileR = ifdef ($R, " "); # $R=" " |
| fileS = ifdef ($S, " "); # $S=" " |
| fileT = ifdef ($T, " "); # $T=" " |
| fmtO = ifdef ($fmt, "text"); # $fmt="text" |
| alpha = ifdef ($alpha, 0.05); # $alpha=0.05 |
| tol = ifdef ($tol, 0.000001); # $tol=0.000001; |
| maxiter = ifdef ($moi, 100); # $moi=100; |
| maxinneriter = ifdef ($mii, 0); # $mii=0; |
| |
| X_orig = read (fileX); |
| |
| TE = read (fileTE); |
| if (fileF != " ") { |
| F = read (fileF); |
| } |
| |
| ######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET |
| |
| if (fileR != " ") { # factors available |
| R = read (fileR); |
| if (ncol (R) != 2) { |
| stop ("Matrix R has wrong dimensions!"); |
| } |
| print ("REMOVING BASLINE LEVEL OF EACH FACTOR..."); |
| # identify baseline columns to be removed from X_orig |
| col_sum = colSums (X_orig); |
| col_seq = t (seq(1, ncol (X_orig))); |
| parfor (i in 1:nrow (R), check = 0) { |
| start_ind = as.scalar (R[i,1]); |
| end_ind = as.scalar (R[i,2]); |
| baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1; |
| col_seq[,baseline_ind] = 0; |
| } |
| ones = matrix (1, rows = nrow (F), cols = 1); |
| F_filter = table (ones, F, 1, ncol (X_orig)); |
| F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols"); |
| TE_F = t(cbind (t (TE), F_filter)); |
| } else if (fileF != " ") { # all features scale |
| TE_F = t(cbind (t (TE), t(F))); |
| } else { # no features available |
| TE_F = TE; |
| } |
| |
| write (TE_F, fileMF, format = fmtO); |
| |
| X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F)); |
| |
| ######## RECODING TIMESTAMPS PRESERVING THE ORDER |
| print ("RECODING TIMESTAMPS..."); |
| |
| N = nrow (X_orig); |
| X_orig = order (target = X_orig, by = 1); |
| Idx = matrix (1, rows = N, cols = 1); |
| num_timestamps = 1; |
| if (N == 1) { |
| RT = matrix (1, rows = 1, cols = 1); |
| } else { |
| Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]); |
| num_timestamps = sum (Idx); |
| A = removeEmpty (target = diag (Idx), margin = "cols"); |
| if (ncol (A) > 1) { |
| A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)]; |
| B = cumsum (A); |
| RT = B %*% seq(1, ncol(B)); |
| } else { # there is only one group |
| RT = matrix (1, rows = N, cols = 1); |
| } |
| } |
| E = X_orig[,2]; |
| |
| print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT"); |
| |
| ######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT |
| # b: the regression betas |
| # o: loss function value |
| # g: loss function gradient |
| # H: loss function Hessian |
| # sb: shift of b in one iteration |
| # so: shift of o in one iteration |
| # r: CG residual = H %*% sb + g |
| # d: CG direction vector |
| # Hd: = H %*% d |
| # c: scalar coefficient in CG |
| # delta: trust region size |
| # tol: tolerance value |
| # i: outer (Newton) iteration count |
| # j: inner (CG) iteration count |
| |
| # computing initial coefficients b (all initialized to 0) |
| if (ncol (X_orig) > 2) { |
| X = X_orig[,3:ncol(X_orig)]; |
| D = ncol (X); |
| zeros_D = matrix (0, rows = D, cols = 1); |
| b = zeros_D; |
| } |
| d_r = aggregate (target = E, groups = RT, fn = "sum"); |
| e_r = aggregate (target = RT, groups = RT, fn = "count"); |
| |
| # computing initial loss function value o |
| num_distinct = nrow (d_r); # no. of distinct timestamps |
| e_r_rev_agg = cumsum (rev(e_r)); |
| d_r_rev = rev(d_r); |
| o = sum (d_r_rev * log (e_r_rev_agg)); |
| o_init = o; |
| if (ncol (X_orig) < 3) { |
| loglik = -o; |
| S_str = "no. of records " + N + " loglik " + loglik; |
| if (fileS != " ") { |
| S = matrix(0, 6, 1); |
| S[1, 1] = N; |
| S[2, 1] = 0; # number of events |
| S[3, 1] = loglik; |
| S[4, 1] = -1; # AIC |
| S[5, 1] = -1; # Rsquare |
| S[6, 1] = -1; #Rsquare_max |
| write (S, fileS, format = fmtO); |
| } else { |
| print (S_str); |
| } |
| stop ("No features are selected!"); |
| } |
| |
| # computing initial gradient g |
| # part 1 g0_1 |
| g0_1 = - t (colSums (X * E)); # g_1 |
| # part 2 g0_2 |
| X_rev_agg = cumsum (rev(X)); |
| select = table (seq (1, num_distinct), e_r_rev_agg); |
| X_agg = select %*% X_rev_agg; |
| g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg)); |
| # |
| g0 = g0_1 + g0_2; |
| g = g0; |
| |
| # initialization for trust region Newton method |
| delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2))); |
| initial_g2 = sum (g ^ 2); |
| exit_g2 = initial_g2 * tol ^ 2; |
| maxiter = 100; |
| maxinneriter = min (D, 100); |
| i = 0; |
| sum_g2 = sum (g ^ 2); |
| while (sum_g2 > exit_g2 & i < maxiter) { |
| i = i + 1; |
| sb = zeros_D; |
| r = g; |
| r2 = sum (r ^ 2); |
| exit_r2 = 0.01 * r2; |
| d = - r; |
| trust_bound_reached = FALSE; |
| j = 0; |
| |
| exp_Xb = exp (X %*% b); |
| exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); |
| D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator |
| X_exp_Xb = X * exp_Xb; |
| X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); |
| X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; |
| |
| while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) { |
| j = j + 1; |
| # computing Hessian times d (Hd) |
| # part 1 Hd_1 |
| Xd = X %*% d; |
| X_Xd_exp_Xb = X * (Xd * exp_Xb); |
| X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb)); |
| X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg; |
| |
| Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev; |
| # part 2 Hd_2 |
| |
| Xd_exp_Xb = Xd * exp_Xb; |
| Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb)); |
| Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg; |
| |
| Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator |
| Hd_2 = Hd_2_num / (D_r_rev ^ 2); |
| |
| Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev)); |
| |
| c = r2 / sum (d * Hd); |
| [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2); |
| sb = sb + c * d; |
| r = r + c * Hd; |
| r2_new = sum (r ^ 2); |
| d = - r + (r2_new / r2) * d; |
| r2 = r2_new; |
| } |
| |
| # computing loss change in 1 iteration (so) |
| # part 1 so_1 |
| so_1 = - as.scalar (colSums (X * E) %*% (b + sb)); |
| # part 2 so_2 |
| exp_Xbsb = exp (X %*% (b + sb)); |
| exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum"); |
| so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg)))); |
| # |
| so = so_1 + so_2; |
| so = so - o; |
| |
| delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g))); |
| if (so < 0) { |
| b = b + sb; |
| o = o + so; |
| # compute new gradient g |
| exp_Xb = exp (X %*% b); |
| exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); |
| X_exp_Xb = X * exp_Xb; |
| X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); |
| X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; |
| |
| D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator |
| g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev)); |
| g = g0_1 + g_2; |
| sum_g2 = sum (g ^ 2); |
| } |
| } |
| |
| if (sum_g2 > exit_g2 & i >= maxiter) { |
| print ("Trust region Newton method did not converge!"); |
| } |
| |
| |
| print ("COMPUTING HESSIAN..."); |
| |
| H0 = matrix (0, rows = D, cols = D); |
| H = matrix (0, rows = D, cols = D); |
| |
| X_exp_Xb_rev_2 = rev(X_exp_Xb); |
| X_rev_2 = rev(X); |
| |
| X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); |
| X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; |
| |
| parfor (i in 1:D, check = 0) { |
| Xi = X[,i]; |
| Xi_rev = rev(Xi); |
| |
| ## ----------Start calculating H0-------------- |
| # part 1 H0_1 |
| Xi_X = X_rev_2[,i:D] * Xi_rev; |
| Xi_X_rev_agg = cumsum (Xi_X); |
| Xi_X_rev_agg = select %*% Xi_X_rev_agg; |
| H0_1 = Xi_X_rev_agg / e_r_rev_agg; |
| |
| # part 2 H0_2 |
| Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum"); |
| Xi_agg_rev_agg = cumsum (rev(Xi_agg)); |
| H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator |
| H0_2 = H0_2_num / (e_r_rev_agg ^ 2); |
| |
| H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev); |
| #-----------End calculating H0-------------------- |
| |
| ## ----------Start calculating H-------------- |
| # part 1 H_1 |
| Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev; |
| Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb); |
| Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg; |
| H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev; |
| |
| # part 2 H_2 |
| Xi_exp_Xb = exp_Xb * Xi; |
| Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum"); |
| |
| Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg)); |
| H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator |
| H_2 = H_2_num / (D_r_rev ^ 2); |
| H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev); |
| #-----------End calculating H-------------------- |
| } |
| H = H + t(H) - diag( diag (H)); |
| H0 = H0 + t(H0) - diag( diag (H0)); |
| |
| |
| # compute standard error for betas |
| H_inv = inv (H); |
| se_b = sqrt (diag (H_inv)); |
| |
| # compute exp(b), Z, Pr[>|Z|] |
| exp_b = exp (b); |
| Z = b / se_b; |
| P = matrix (0, rows = D, cols = 1); |
| parfor (i in 1:D) { |
| P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal")); |
| } |
| |
| # compute confidence intervals for b |
| z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal"); |
| CI_l = b - se_b * z_alpha_2; |
| CI_r = b - se_b + z_alpha_2; |
| |
| ######## SOME STATISTICS AND TESTS |
| S = matrix(0, 6, 1); |
| T = matrix(0, 3, 3); |
| |
| # no. of records |
| S_str = "no. of records " + N; |
| S[1, 1] = N; |
| |
| # no.of events |
| S_str = append (S_str, "no. of events " + sum (E)); |
| S[2, 1] = sum (E); |
| |
| # log-likelihood |
| loglik = -o; |
| S_str = append (S_str, "loglik " + loglik + " "); |
| S[3, 1] = loglik; |
| |
| # AIC = -2 * loglik + 2 * D |
| AIC = -2 * loglik + 2 * D; |
| S_str = append (S_str, "AIC " + AIC + " "); |
| S[4, 1] = AIC; |
| |
| # Wald test |
| wald_t = as.scalar (t(b) %*% H %*% b); |
| wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D); |
| T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " "; |
| T[1, 1] = wald_t; |
| T[1, 2] = D; |
| T[1, 3] = wald_p; |
| |
| # Likelihood ratio test |
| lratio_t = 2 * o_init - 2 * o; |
| lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D); |
| T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " "); |
| |
| T[2, 1] = lratio_t; |
| T[2, 2] = D; |
| T[2, 3] = lratio_p; |
| |
| H0_inv = inv (H0); |
| score_t = as.scalar (t (g0) %*% H0_inv %*% g0); |
| score_p = 1 - cdf (target = score_t, dist = "chisq", df = D); |
| T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " "); |
| T[3, 1] = score_t; |
| T[3, 2] = D; |
| T[3, 3] = score_p; |
| |
| # Rsquare (Cox & Snell) |
| Rsquare = 1 - exp (-lratio_t / N); |
| Rsquare_max = 1 - exp (-2 * o_init / N); |
| S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " "); |
| S[5, 1] = Rsquare; |
| S_str = append (S_str, "max possible Rsquare: " + Rsquare_max); |
| S[6, 1] = Rsquare_max; |
| |
| M = matrix (0, rows = D, cols = 7); |
| M[,1] = b; |
| M[,2] = exp_b; |
| M[,3] = se_b; |
| M[,4] = Z; |
| M[,5] = P; |
| M[,6] = CI_l; |
| M[,7] = CI_r; |
| |
| write (M, fileM, format = fmtO); |
| if (fileS != " ") { |
| write (S, fileS, format = fmtO); |
| } else { |
| print (S_str); |
| } |
| if (fileT != " ") { |
| write (T, fileT, format = fmtO); |
| } else { |
| print (T_str); |
| } |
| # needed for prediction |
| write (RT, fileRT, format = fmtO); |
| write (H_inv, fileCOV, format = fmtO); |
| write (X_orig, fileXO, format = fmtO); |
| |
| |
| ####### UDFS FOR TRUST REGION NEWTON METHOD |
| |
| ensure_trust_bound = |
| function (double x, double a, double b, double c) |
| return (double x_new, boolean is_violated) |
| { |
| if (a * x^2 + b * x + c > 0) |
| { |
| is_violated = TRUE; |
| rad = sqrt (b ^ 2 - 4 * a * c); |
| if (b >= 0) { |
| x_new = - (2 * c) / (b + rad); |
| } else { |
| x_new = - (b - rad) / (2 * a); |
| } |
| } else { |
| is_violated = FALSE; |
| x_new = x; |
| } |
| } |
| |
| update_trust_bound = |
| function (double delta, |
| double sb_distance, |
| double so_exact, |
| double so_linear_approx, |
| double so_quadratic_approx) |
| return (double delta) |
| { |
| sigma1 = 0.25; |
| sigma2 = 0.5; |
| sigma3 = 4.0; |
| |
| if (so_exact <= so_linear_approx) { |
| alpha = sigma3; |
| } else { |
| alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx)); |
| } |
| |
| rho = so_exact / so_quadratic_approx; |
| if (rho < 0.0001) { |
| delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta); |
| } else { if (rho < 0.25) { |
| delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta)); |
| } else { if (rho < 0.75) { |
| delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta)); |
| } else { |
| delta = max (delta, min (alpha * sb_distance, sigma3 * delta)); |
| }}} |
| } |