scripts/algorithms/Cox.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL.
 # The Breslow method is used for handling ties and the regression parameters
 # are computed using trust region newton method with conjugate gradient
 #
 # INPUT   PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME    TYPE     DEFAULT      MEANING
 # ---------------------------------------------------------------------------------------------
 # X       String   ---          Location to read the input matrix X containing the survival data containing the following information
 # 								 - 1: timestamps
 #								 - 2: whether an event occurred (1) or data is censored (0)
 #								 - 3: feature vectors
 # TE	  String   ---          Column indices of X as a column vector which contain timestamp (first row) and event information (second row)
 # F 	  String   " "			Column indices of X as a column vector which are to be used for fitting the Cox model
 # R   	  String   " "	        If factors (categorical variables) are available in the input matrix X, location to read matrix R containing
 #								the start and end indices of the factors in X
 #								 - R[,1]: start indices
 #								 - R[,2]: end indices
 #								Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X;
 #								in this case the start and end indices corresponding to the baseline level need to be the same;
 #								if R is not provided by default all variables are considered to be continuous
 # M       String   ---          Location to store the results of Cox regression analysis including estimated regression parameters of the fitted
 #								Cox model (the betas), their standard errors, confidence intervals, and P-values
 # S       String   " "          Location to store a summary of some statistics of the fitted cox proportional hazard model including
 #								no. of records, no. of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare;
 #								by default is standard output
 # T       String   " "          Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model;
 #								by default is standard output
 # COV	  String   ---			Location to store the variance-covariance matrix of the betas
 # RT      String   ---			Location to store matrix RT containing the order-preserving recoded timestamps from X
 # XO      String   ---			Location to store sorted input matrix by the timestamps
 # MF      String   ---          Location to store column indices of X excluding the baseline factors if available
 # alpha   Double   0.05         Parameter to compute a 100*(1-alpha)% confidence interval for the betas
 # tol     Double   0.000001     Tolerance ("epsilon")
 # moi     Int      100     		Max. number of outer (Newton) iterations
 # mii     Int      0      		Max. number of inner (conjugate gradient) iterations, 0 = no max
 # fmt     String   "text"       Matrix output format, usually "text" or "csv" (for matrices only)
 # ---------------------------------------------------------------------------------------------
 # OUTPUT:
 # 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
 #	M[,1]: betas
 #	M[,2]: exp(betas)
 #	M[,3]: standard error of betas
 #	M[,4]: Z
 #	M[,5]: P-value
 #	M[,6]: lower 100*(1-alpha)% confidence interval of betas
 #	M[,7]: upper 100*(1-alpha)% confidence interval of betas
 #
 # Two matrices containing a summary of some statistics of the fitted model:
 # 1- File S with the following format
 #	- row 1: no. of observations
 #	- row 2: no. of events
 #   - row 3: log-likelihood
 #	- row 4: AIC
 #	- row 5: Rsquare (Cox & Snell)
 #	- row 6: max possible Rsquare
 # 2- File T with the following format
 #	- row 1: Likelihood ratio test statistic, degree of freedom, P-value
 #	- row 2: Wald test statistic, degree of freedom, P-value
 #	- row 3: Score (log-rank) test statistic, degree of freedom, P-value
 #
 # Additionally, the following matrices are stored (needed for prediction)
 # 1- A column matrix RT that contains the order-preserving recoded timestamps from X
 # 2- Matrix XO which is matrix X with sorted timestamps
 # 3- Variance-covariance matrix of the betas COV
 # 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
 # -------------------------------------------------------------------------------------------
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar SystemDS.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R
 #									  M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT
 #									  XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv

 fileX = $X;
 fileTE = $TE;
 fileRT = $RT;
 fileMF = $MF;
 fileM = $M;
 fileXO = $XO;
 fileCOV = $COV;

 # Default values of some parameters
 fileF = ifdef ($F, " ");			 # $F=" "
 fileR = ifdef ($R, " ");         	 # $R=" "
 fileS = ifdef ($S, " ");         	 # $S=" "
 fileT = ifdef ($T, " ");         	 # $T=" "
 fmtO = ifdef ($fmt, "text");         # $fmt="text"
 alpha = ifdef ($alpha, 0.05);        # $alpha=0.05
 tol = ifdef ($tol, 0.000001);   	 # $tol=0.000001;
 maxiter = ifdef ($moi, 100);    	 # $moi=100;
 maxinneriter = ifdef ($mii, 0);      # $mii=0;

 X_orig = read (fileX);

 TE = read (fileTE);
 if (fileF != " ") {
 	F = read (fileF);
 }

 ######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET

 if (fileR != " ") { # factors available
 	R = read (fileR);
 	if (ncol (R) != 2) {
 		stop ("Matrix R has wrong dimensions!");
 	}
 	print ("REMOVING BASLINE LEVEL OF EACH FACTOR...");
 	# identify baseline columns to be removed from X_orig
 	col_sum = colSums (X_orig);
 	col_seq = t (seq(1, ncol (X_orig)));
 	parfor (i in 1:nrow (R), check = 0) {
 		start_ind = as.scalar (R[i,1]);
 		end_ind = as.scalar (R[i,2]);
 		baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
 		col_seq[,baseline_ind] = 0;
 	}
 	ones = matrix (1, rows = nrow (F), cols = 1);
 	F_filter = table (ones, F, 1, ncol (X_orig));
 	F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
 	TE_F = t(cbind (t (TE), F_filter));
 } else if (fileF != " ") { # all features scale
 	TE_F = t(cbind (t (TE), t(F)));
 } else { # no features available
 	TE_F = TE;
 }

 write (TE_F, fileMF, format = fmtO);

 X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));

 ######## RECODING TIMESTAMPS PRESERVING THE ORDER
 print ("RECODING TIMESTAMPS...");

 N = nrow (X_orig);
 X_orig = order (target = X_orig, by = 1);
 Idx = matrix (1, rows = N, cols = 1);
 num_timestamps = 1;
 if (N == 1) {
 	RT = matrix (1, rows = 1, cols = 1);
 } else {
 	Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]);
 	num_timestamps = sum (Idx);
 	A = removeEmpty (target = diag (Idx), margin = "cols");
 	if (ncol (A) > 1) {
 		A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
 		B = cumsum (A);
 		RT = B %*% seq(1, ncol(B));
 	} else { # there is only one group
 		RT = matrix (1, rows = N, cols = 1);
 	}
 }
 E = X_orig[,2];

 print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");

 ######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
 #  b: the regression betas
 #  o: loss function value
 #  g: loss function gradient
 #  H: loss function Hessian
 # sb: shift of b in one iteration
 # so: shift of o in one iteration
 #  r: CG residual = H %*% sb + g
 #  d: CG direction vector
 # Hd: = H %*% d
 #  c: scalar coefficient in CG
 # delta: trust region size
 # tol: tolerance value
 #  i: outer (Newton) iteration count
 #  j: inner (CG) iteration count

 # computing initial coefficients b (all initialized to 0)
 if (ncol (X_orig) > 2) {
 	X = X_orig[,3:ncol(X_orig)];
 	D = ncol (X);
 	zeros_D = matrix (0, rows = D, cols = 1);
 	b = zeros_D;
 }
 d_r = aggregate (target = E, groups = RT, fn = "sum");
 e_r = aggregate (target = RT, groups = RT, fn = "count");

 # computing initial loss function value o
 num_distinct = nrow (d_r); # no. of distinct timestamps
 e_r_rev_agg = cumsum (rev(e_r));
 d_r_rev = rev(d_r);
 o = sum (d_r_rev * log (e_r_rev_agg));
 o_init = o;
 if (ncol (X_orig) < 3) {
 	loglik = -o;
 	S_str = "no. of records " + N + " loglik " + loglik;
 	if (fileS != " ") {
 	  S = matrix(0, 6, 1);
 	  S[1, 1] = N;
 	  S[2, 1] = 0; # number of events
 	  S[3, 1] = loglik;
 	  S[4, 1] = -1; # AIC
 	  S[5, 1] = -1; # Rsquare
 	  S[6, 1] = -1; #Rsquare_max
 		write (S, fileS, format = fmtO);
 	} else {
 		print (S_str);
 	}
 	stop ("No features are selected!");
 }

 # computing initial gradient g
 # part 1 g0_1
 g0_1 = - t (colSums (X * E)); # g_1
 # part 2 g0_2
 X_rev_agg = cumsum (rev(X));
 select = table (seq (1, num_distinct), e_r_rev_agg);
 X_agg = select %*% X_rev_agg;
 g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
 #
 g0 = g0_1 + g0_2;
 g = g0;

 # initialization for trust region Newton method
 delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
 initial_g2 = sum (g ^ 2);
 exit_g2 = initial_g2 * tol ^ 2;
 maxiter = 100;
 maxinneriter = min (D, 100);
 i = 0;
 sum_g2 = sum (g ^ 2);
 while (sum_g2 > exit_g2 & i < maxiter) {
 	i = i + 1;
     sb = zeros_D;
     r = g;
     r2 = sum (r ^ 2);
     exit_r2 = 0.01 * r2;
     d = - r;
     trust_bound_reached = FALSE;
     j = 0;

 	exp_Xb = exp (X %*% b);
 	exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
 	D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
 	X_exp_Xb = X * exp_Xb;
 	X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
 	X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;

     while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
         j = j + 1;
 		# computing Hessian times d (Hd)
 		# part 1 Hd_1
 		Xd = X %*% d;
 		X_Xd_exp_Xb = X * (Xd * exp_Xb);
 		X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
 		X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;

 		Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
 		# part 2 Hd_2

 		Xd_exp_Xb = Xd * exp_Xb;
 		Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
 		Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;

 		Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
 		Hd_2 = Hd_2_num / (D_r_rev ^ 2);

 		Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));

 		c = r2 / sum (d * Hd);
         [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
         sb = sb + c * d;
         r = r + c * Hd;
         r2_new = sum (r ^ 2);
         d = - r + (r2_new / r2) * d;
         r2 = r2_new;
     }

     # computing loss change in 1 iteration (so)
 	# part 1 so_1
 	so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
 	# part 2 so_2
 	exp_Xbsb = exp (X %*% (b + sb));
 	exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
 	so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
 	#
 	so = so_1 + so_2;
 	so = so - o;

 	delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
     if (so < 0) {
         b = b + sb;
         o = o + so;
 		# compute new gradient g
 		exp_Xb = exp (X %*% b);
 		exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
 		X_exp_Xb = X * exp_Xb;
 		X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
 		X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;

 		D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
 		g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
 		g = g0_1 + g_2;
 		sum_g2 = sum (g ^ 2);
     }
 }

 if (sum_g2 > exit_g2 & i >= maxiter) {
 	print ("Trust region Newton method did not converge!");
 }


 print ("COMPUTING HESSIAN...");

 H0 = matrix (0, rows = D, cols = D);
 H = matrix (0, rows = D, cols = D);

 X_exp_Xb_rev_2 = rev(X_exp_Xb);
 X_rev_2 = rev(X);

 X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
 X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;

 parfor (i in 1:D, check = 0) {
 	Xi = X[,i];
 	Xi_rev = rev(Xi);

 	## ----------Start calculating H0--------------
 	# part 1 H0_1
 	Xi_X = X_rev_2[,i:D] * Xi_rev;
 	Xi_X_rev_agg = cumsum (Xi_X);
 	Xi_X_rev_agg = select %*% Xi_X_rev_agg;
 	H0_1 = Xi_X_rev_agg / e_r_rev_agg;

 	# part 2 H0_2
 	Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
 	Xi_agg_rev_agg = cumsum (rev(Xi_agg));
 	H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
 	H0_2 = H0_2_num / (e_r_rev_agg ^ 2);

 	H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
 	#-----------End calculating H0--------------------

 	## ----------Start calculating H--------------
 	# part 1 H_1
 	Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
 	Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
 	Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
 	H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;

 	# part 2 H_2
 	Xi_exp_Xb = exp_Xb * Xi;
 	Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");

 	Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
 	H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
 	H_2 = H_2_num / (D_r_rev ^ 2);
 	H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
 	#-----------End calculating H--------------------
 }
 H = H + t(H) - diag( diag (H));
 H0 = H0 + t(H0) - diag( diag (H0));


 # compute standard error for betas
 H_inv = inv (H);
 se_b = sqrt (diag (H_inv));

 # compute exp(b), Z, Pr[>|Z|]
 exp_b = exp (b);
 Z = b / se_b;
 P = matrix (0, rows = D, cols = 1);
 parfor (i in 1:D) {
 	P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal"));
 }

 # compute confidence intervals for b
 z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
 CI_l = b - se_b * z_alpha_2;
 CI_r = b - se_b + z_alpha_2;

 ######## SOME STATISTICS AND TESTS
 S = matrix(0, 6, 1);
 T = matrix(0, 3, 3);

 # no. of records
 S_str = "no. of records " + N;
 S[1, 1] = N;

 # no.of events
 S_str = append (S_str, "no. of events " + sum (E));
 S[2, 1] = sum (E);

 # log-likelihood
 loglik = -o;
 S_str = append (S_str, "loglik " + loglik + " ");
 S[3, 1] = loglik;

 # AIC = -2 * loglik + 2 * D
 AIC = -2 * loglik + 2 * D;
 S_str = append (S_str, "AIC " + AIC + " ");
 S[4, 1] = AIC;

 # Wald test
 wald_t = as.scalar (t(b) %*% H %*% b);
 wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
 T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " ";
 T[1, 1] = wald_t;
 T[1, 2] = D;
 T[1, 3] = wald_p;

 # Likelihood ratio test
 lratio_t = 2 * o_init - 2 * o;
 lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
 T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " ");

 T[2, 1] = lratio_t;
 T[2, 2] = D;
 T[2, 3] = lratio_p;

 H0_inv = inv (H0);
 score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
 score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
 T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " ");
 T[3, 1] = score_t;
 T[3, 2] = D;
 T[3, 3] = score_p;

 # Rsquare (Cox & Snell)
 Rsquare = 1 - exp (-lratio_t / N);
 Rsquare_max = 1 - exp (-2 * o_init / N);
 S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " ");
 S[5, 1] = Rsquare;
 S_str = append (S_str, "max possible Rsquare: " + Rsquare_max);
 S[6, 1] = Rsquare_max;

 M = matrix (0, rows = D, cols = 7);
 M[,1] = b;
 M[,2] = exp_b;
 M[,3] = se_b;
 M[,4] = Z;
 M[,5] = P;
 M[,6] = CI_l;
 M[,7] = CI_r;

 write (M, fileM, format = fmtO);
 if (fileS != " ") {
 	write (S, fileS, format = fmtO);
 } else {
 	print (S_str);
 }
 if (fileT != " ") {
 	write (T, fileT, format = fmtO);
 } else {
 	print (T_str);
 }
 # needed for prediction
 write (RT, fileRT, format = fmtO);
 write (H_inv, fileCOV, format = fmtO);
 write (X_orig, fileXO, format = fmtO);


 ####### UDFS FOR TRUST REGION NEWTON METHOD

 ensure_trust_bound =
     function (double x, double a, double b, double c)
     return (double x_new, boolean is_violated)
 {
     if (a * x^2 + b * x + c > 0)
     {
         is_violated = TRUE;
         rad = sqrt (b ^ 2 - 4 * a * c);
         if (b >= 0) {
             x_new = - (2 * c) / (b + rad);
         } else {
             x_new = - (b - rad) / (2 * a);
         }
     } else {
         is_violated = FALSE;
         x_new = x;
     }
 }

 update_trust_bound =
     function (double delta,
               double sb_distance,
               double so_exact,
               double so_linear_approx,
               double so_quadratic_approx)
     return   (double delta)
 {
     sigma1 = 0.25;
     sigma2 = 0.5;
     sigma3 = 4.0;

     if (so_exact <= so_linear_approx) {
        alpha = sigma3;
     } else {
        alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
     }

     rho = so_exact / so_quadratic_approx;
     if (rho < 0.0001) {
         delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
     } else { if (rho < 0.25) {
         delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
     } else { if (rho < 0.75) {
         delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
     } else {
         delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
     }}}
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL.
	# The Breslow method is used for handling ties and the regression parameters
	# are computed using trust region newton method with conjugate gradient
	#
	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X String --- Location to read the input matrix X containing the survival data containing the following information
	# - 1: timestamps
	# - 2: whether an event occurred (1) or data is censored (0)
	# - 3: feature vectors
	# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row)
	# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model
	# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing
	# the start and end indices of the factors in X
	# - R[,1]: start indices
	# - R[,2]: end indices
	# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X;
	# in this case the start and end indices corresponding to the baseline level need to be the same;
	# if R is not provided by default all variables are considered to be continuous
	# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted
	# Cox model (the betas), their standard errors, confidence intervals, and P-values
	# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including
	# no. of records, no. of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare;
	# by default is standard output
	# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model;
	# by default is standard output
	# COV String --- Location to store the variance-covariance matrix of the betas
	# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X
	# XO String --- Location to store sorted input matrix by the timestamps
	# MF String --- Location to store column indices of X excluding the baseline factors if available
	# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas
	# tol Double 0.000001 Tolerance ("epsilon")
	# moi Int 100 Max. number of outer (Newton) iterations
	# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max
	# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
	# ---------------------------------------------------------------------------------------------
	# OUTPUT:
	# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
	# M[,1]: betas
	# M[,2]: exp(betas)
	# M[,3]: standard error of betas
	# M[,4]: Z
	# M[,5]: P-value
	# M[,6]: lower 100*(1-alpha)% confidence interval of betas
	# M[,7]: upper 100*(1-alpha)% confidence interval of betas
	#
	# Two matrices containing a summary of some statistics of the fitted model:
	# 1- File S with the following format
	# - row 1: no. of observations
	# - row 2: no. of events
	# - row 3: log-likelihood
	# - row 4: AIC
	# - row 5: Rsquare (Cox & Snell)
	# - row 6: max possible Rsquare
	# 2- File T with the following format
	# - row 1: Likelihood ratio test statistic, degree of freedom, P-value
	# - row 2: Wald test statistic, degree of freedom, P-value
	# - row 3: Score (log-rank) test statistic, degree of freedom, P-value
	#
	# Additionally, the following matrices are stored (needed for prediction)
	# 1- A column matrix RT that contains the order-preserving recoded timestamps from X
	# 2- Matrix XO which is matrix X with sorted timestamps
	# 3- Variance-covariance matrix of the betas COV
	# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
	# -------------------------------------------------------------------------------------------
	# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
	# hadoop jar SystemDS.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R
	# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT
	# XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv

	fileX = $X;
	fileTE = $TE;
	fileRT = $RT;
	fileMF = $MF;
	fileM = $M;
	fileXO = $XO;
	fileCOV = $COV;

	# Default values of some parameters
	fileF = ifdef ($F, " "); # $F=" "
	fileR = ifdef ($R, " "); # $R=" "
	fileS = ifdef ($S, " "); # $S=" "
	fileT = ifdef ($T, " "); # $T=" "
	fmtO = ifdef ($fmt, "text"); # $fmt="text"
	alpha = ifdef ($alpha, 0.05); # $alpha=0.05
	tol = ifdef ($tol, 0.000001); # $tol=0.000001;
	maxiter = ifdef ($moi, 100); # $moi=100;
	maxinneriter = ifdef ($mii, 0); # $mii=0;

	X_orig = read (fileX);

	TE = read (fileTE);
	if (fileF != " ") {
	F = read (fileF);
	}

	######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET

	if (fileR != " ") { # factors available
	R = read (fileR);
	if (ncol (R) != 2) {
	stop ("Matrix R has wrong dimensions!");
	}
	print ("REMOVING BASLINE LEVEL OF EACH FACTOR...");
	# identify baseline columns to be removed from X_orig
	col_sum = colSums (X_orig);
	col_seq = t (seq(1, ncol (X_orig)));
	parfor (i in 1:nrow (R), check = 0) {
	start_ind = as.scalar (R[i,1]);
	end_ind = as.scalar (R[i,2]);
	baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
	col_seq[,baseline_ind] = 0;
	}
	ones = matrix (1, rows = nrow (F), cols = 1);
	F_filter = table (ones, F, 1, ncol (X_orig));
	F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
	TE_F = t(cbind (t (TE), F_filter));
	} else if (fileF != " ") { # all features scale
	TE_F = t(cbind (t (TE), t(F)));
	} else { # no features available
	TE_F = TE;
	}

	write (TE_F, fileMF, format = fmtO);

	X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));

	######## RECODING TIMESTAMPS PRESERVING THE ORDER
	print ("RECODING TIMESTAMPS...");

	N = nrow (X_orig);
	X_orig = order (target = X_orig, by = 1);
	Idx = matrix (1, rows = N, cols = 1);
	num_timestamps = 1;
	if (N == 1) {
	RT = matrix (1, rows = 1, cols = 1);
	} else {
	Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]);
	num_timestamps = sum (Idx);
	A = removeEmpty (target = diag (Idx), margin = "cols");
	if (ncol (A) > 1) {
	A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
	B = cumsum (A);
	RT = B %*% seq(1, ncol(B));
	} else { # there is only one group
	RT = matrix (1, rows = N, cols = 1);
	}
	}
	E = X_orig[,2];

	print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");

	######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
	# b: the regression betas
	# o: loss function value
	# g: loss function gradient
	# H: loss function Hessian
	# sb: shift of b in one iteration
	# so: shift of o in one iteration
	# r: CG residual = H %*% sb + g
	# d: CG direction vector
	# Hd: = H %*% d
	# c: scalar coefficient in CG
	# delta: trust region size
	# tol: tolerance value
	# i: outer (Newton) iteration count
	# j: inner (CG) iteration count

	# computing initial coefficients b (all initialized to 0)
	if (ncol (X_orig) > 2) {
	X = X_orig[,3:ncol(X_orig)];
	D = ncol (X);
	zeros_D = matrix (0, rows = D, cols = 1);
	b = zeros_D;
	}
	d_r = aggregate (target = E, groups = RT, fn = "sum");
	e_r = aggregate (target = RT, groups = RT, fn = "count");

	# computing initial loss function value o
	num_distinct = nrow (d_r); # no. of distinct timestamps
	e_r_rev_agg = cumsum (rev(e_r));
	d_r_rev = rev(d_r);
	o = sum (d_r_rev * log (e_r_rev_agg));
	o_init = o;
	if (ncol (X_orig) < 3) {
	loglik = -o;
	S_str = "no. of records " + N + " loglik " + loglik;
	if (fileS != " ") {
	S = matrix(0, 6, 1);
	S[1, 1] = N;
	S[2, 1] = 0; # number of events
	S[3, 1] = loglik;
	S[4, 1] = -1; # AIC
	S[5, 1] = -1; # Rsquare
	S[6, 1] = -1; #Rsquare_max
	write (S, fileS, format = fmtO);
	} else {
	print (S_str);
	}
	stop ("No features are selected!");
	}

	# computing initial gradient g
	# part 1 g0_1
	g0_1 = - t (colSums (X * E)); # g_1
	# part 2 g0_2
	X_rev_agg = cumsum (rev(X));
	select = table (seq (1, num_distinct), e_r_rev_agg);
	X_agg = select %*% X_rev_agg;
	g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
	#
	g0 = g0_1 + g0_2;
	g = g0;

	# initialization for trust region Newton method
	delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
	initial_g2 = sum (g ^ 2);
	exit_g2 = initial_g2 * tol ^ 2;
	maxiter = 100;
	maxinneriter = min (D, 100);
	i = 0;
	sum_g2 = sum (g ^ 2);
	while (sum_g2 > exit_g2 & i < maxiter) {
	i = i + 1;
	sb = zeros_D;
	r = g;
	r2 = sum (r ^ 2);
	exit_r2 = 0.01 * r2;
	d = - r;
	trust_bound_reached = FALSE;
	j = 0;

	exp_Xb = exp (X %*% b);
	exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
	D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
	X_exp_Xb = X * exp_Xb;
	X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
	X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;

	while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
	j = j + 1;
	# computing Hessian times d (Hd)
	# part 1 Hd_1
	Xd = X %*% d;
	X_Xd_exp_Xb = X * (Xd * exp_Xb);
	X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
	X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;

	Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
	# part 2 Hd_2

	Xd_exp_Xb = Xd * exp_Xb;
	Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
	Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;

	Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
	Hd_2 = Hd_2_num / (D_r_rev ^ 2);

	Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));

	c = r2 / sum (d * Hd);
	[c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
	sb = sb + c * d;
	r = r + c * Hd;
	r2_new = sum (r ^ 2);
	d = - r + (r2_new / r2) * d;
	r2 = r2_new;
	}

	# computing loss change in 1 iteration (so)
	# part 1 so_1
	so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
	# part 2 so_2
	exp_Xbsb = exp (X %*% (b + sb));
	exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
	so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
	#
	so = so_1 + so_2;
	so = so - o;

	delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
	if (so < 0) {
	b = b + sb;
	o = o + so;
	# compute new gradient g
	exp_Xb = exp (X %*% b);
	exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
	X_exp_Xb = X * exp_Xb;
	X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
	X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;

	D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
	g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
	g = g0_1 + g_2;
	sum_g2 = sum (g ^ 2);
	}
	}

	if (sum_g2 > exit_g2 & i >= maxiter) {
	print ("Trust region Newton method did not converge!");
	}


	print ("COMPUTING HESSIAN...");

	H0 = matrix (0, rows = D, cols = D);
	H = matrix (0, rows = D, cols = D);

	X_exp_Xb_rev_2 = rev(X_exp_Xb);
	X_rev_2 = rev(X);

	X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
	X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;

	parfor (i in 1:D, check = 0) {
	Xi = X[,i];
	Xi_rev = rev(Xi);

	## ----------Start calculating H0--------------
	# part 1 H0_1
	Xi_X = X_rev_2[,i:D] * Xi_rev;
	Xi_X_rev_agg = cumsum (Xi_X);
	Xi_X_rev_agg = select %*% Xi_X_rev_agg;
	H0_1 = Xi_X_rev_agg / e_r_rev_agg;

	# part 2 H0_2
	Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
	Xi_agg_rev_agg = cumsum (rev(Xi_agg));
	H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
	H0_2 = H0_2_num / (e_r_rev_agg ^ 2);

	H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
	#-----------End calculating H0--------------------

	## ----------Start calculating H--------------
	# part 1 H_1
	Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
	Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
	Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
	H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;

	# part 2 H_2
	Xi_exp_Xb = exp_Xb * Xi;
	Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");

	Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
	H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
	H_2 = H_2_num / (D_r_rev ^ 2);
	H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
	#-----------End calculating H--------------------
	}
	H = H + t(H) - diag( diag (H));
	H0 = H0 + t(H0) - diag( diag (H0));


	# compute standard error for betas
	H_inv = inv (H);
	se_b = sqrt (diag (H_inv));

	# compute exp(b), Z, Pr[>\|Z\|]
	exp_b = exp (b);
	Z = b / se_b;
	P = matrix (0, rows = D, cols = 1);
	parfor (i in 1:D) {
	P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal"));
	}

	# compute confidence intervals for b
	z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
	CI_l = b - se_b * z_alpha_2;
	CI_r = b - se_b + z_alpha_2;

	######## SOME STATISTICS AND TESTS
	S = matrix(0, 6, 1);
	T = matrix(0, 3, 3);

	# no. of records
	S_str = "no. of records " + N;
	S[1, 1] = N;

	# no.of events
	S_str = append (S_str, "no. of events " + sum (E));
	S[2, 1] = sum (E);

	# log-likelihood
	loglik = -o;
	S_str = append (S_str, "loglik " + loglik + " ");
	S[3, 1] = loglik;

	# AIC = -2 * loglik + 2 * D
	AIC = -2 * loglik + 2 * D;
	S_str = append (S_str, "AIC " + AIC + " ");
	S[4, 1] = AIC;

	# Wald test
	wald_t = as.scalar (t(b) %% H %% b);
	wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
	T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " ";
	T[1, 1] = wald_t;
	T[1, 2] = D;
	T[1, 3] = wald_p;

	# Likelihood ratio test
	lratio_t = 2 * o_init - 2 * o;
	lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
	T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " ");

	T[2, 1] = lratio_t;
	T[2, 2] = D;
	T[2, 3] = lratio_p;

	H0_inv = inv (H0);
	score_t = as.scalar (t (g0) %% H0_inv %% g0);
	score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
	T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " ");
	T[3, 1] = score_t;
	T[3, 2] = D;
	T[3, 3] = score_p;

	# Rsquare (Cox & Snell)
	Rsquare = 1 - exp (-lratio_t / N);
	Rsquare_max = 1 - exp (-2 * o_init / N);
	S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " ");
	S[5, 1] = Rsquare;
	S_str = append (S_str, "max possible Rsquare: " + Rsquare_max);
	S[6, 1] = Rsquare_max;

	M = matrix (0, rows = D, cols = 7);
	M[,1] = b;
	M[,2] = exp_b;
	M[,3] = se_b;
	M[,4] = Z;
	M[,5] = P;
	M[,6] = CI_l;
	M[,7] = CI_r;

	write (M, fileM, format = fmtO);
	if (fileS != " ") {
	write (S, fileS, format = fmtO);
	} else {
	print (S_str);
	}
	if (fileT != " ") {
	write (T, fileT, format = fmtO);
	} else {
	print (T_str);
	}
	# needed for prediction
	write (RT, fileRT, format = fmtO);
	write (H_inv, fileCOV, format = fmtO);
	write (X_orig, fileXO, format = fmtO);


	####### UDFS FOR TRUST REGION NEWTON METHOD

	ensure_trust_bound =
	function (double x, double a, double b, double c)
	return (double x_new, boolean is_violated)
	{
	if (a * x^2 + b * x + c > 0)
	{
	is_violated = TRUE;
	rad = sqrt (b ^ 2 - 4 * a * c);
	if (b >= 0) {
	x_new = - (2 * c) / (b + rad);
	} else {
	x_new = - (b - rad) / (2 * a);
	}
	} else {
	is_violated = FALSE;
	x_new = x;
	}
	}

	update_trust_bound =
	function (double delta,
	double sb_distance,
	double so_exact,
	double so_linear_approx,
	double so_quadratic_approx)
	return (double delta)
	{
	sigma1 = 0.25;
	sigma2 = 0.5;
	sigma3 = 4.0;

	if (so_exact <= so_linear_approx) {
	alpha = sigma3;
	} else {
	alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
	}

	rho = so_exact / so_quadratic_approx;
	if (rho < 0.0001) {
	delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
	} else { if (rho < 0.25) {
	delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
	} else { if (rho < 0.75) {
	delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
	} else {
	delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
	}}}
	}