#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT FITS A COX PROPORTIONAL HAZARDS REGRESSION MODEL.
# The Breslow method is used for handling ties, and the regression parameters are
# computed using a trust-region Newton method with conjugate gradient.
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME   TYPE    DEFAULT   MEANING
# ---------------------------------------------------------------------------------------------
# X      Matrix  ---       Input matrix X containing the survival data with the following
#                          information:
#                          1: timestamps
#                          2: whether an event occurred (1) or the data is censored (0)
#                          3: feature vectors
# TE     Matrix  ---       Column indices of X (as a column vector) which contain the timestamps
#                          (first row) and the event information (second row)
# F      Matrix  ---       Column indices of X (as a column vector) which are to be used for
#                          fitting the Cox model
# R      Matrix  ---       If factors (categorical variables) are available in the input matrix
#                          X, matrix R containing the start and end indices of the factors in X
#                          R[,1]: start indices
#                          R[,2]: end indices
#                          Alternatively, the user can specify the index of the baseline level
#                          of each factor which needs to be removed from X; in this case the
#                          start and end indices corresponding to the baseline level need to be
#                          the same; if R is not provided (i.e., R[1,1] = 0), all variables are
#                          considered to be continuous
# alpha  Double  0.05      Parameter to compute a 100*(1-alpha)% confidence interval for the betas
# tol    Double  0.000001  Tolerance ("epsilon")
# moi    Int     100       Max. number of outer (Newton) iterations
# mii    Int     0         Max. number of inner (conjugate gradient) iterations, 0 = no max
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
# M[,1]: betas
# M[,2]: exp(betas)
# M[,3]: standard error of betas
# M[,4]: Z
# M[,5]: P-value
# M[,6]: lower 100*(1-alpha)% confidence interval of betas
# M[,7]: upper 100*(1-alpha)% confidence interval of betas
#
# Two matrices containing a summary of some statistics of the fitted model:
# 1 - Matrix S with the following format
#     - row 1: no. of observations
#     - row 2: no. of events
#     - row 3: log-likelihood
#     - row 4: AIC
#     - row 5: Rsquare (Cox & Snell)
#     - row 6: maximum possible Rsquare
# 2 - Matrix T with the following format
#     - row 1: Likelihood ratio test statistic, degrees of freedom, P-value
#     - row 2: Wald test statistic, degrees of freedom, P-value
#     - row 3: Score (log-rank) test statistic, degrees of freedom, P-value
#
# Additionally, the following matrices are returned (needed for prediction):
# 1 - A column matrix RT that contains the order-preserving recoded timestamps from X
# 2 - Matrix XO which is matrix X with sorted timestamps
# 3 - Variance-covariance matrix of the betas COV
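#
# Example usage (a minimal sketch; the file name and column indices below are hypothetical and
# assume a 4-column X with two scale features in columns 3 and 4, with this function in scope):
#
#   X  = read ("survival_data.csv", format = "csv");
#   TE = matrix ("1 2", rows = 2, cols = 1);   # timestamp and event columns of X
#   F  = matrix ("3 4", rows = 2, cols = 1);   # feature columns of X used for fitting
#   R  = matrix (0, rows = 1, cols = 1);       # R[1,1] = 0: no categorical factors
#   [M, S, T, COV, RT, XO] = m_cox (X = X, TE = TE, F = F, R = R,
#                                   alpha = 0.05, tol = 0.000001, moi = 100, mii = 0);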
# -------------------------------------------------------------------------------------------
m_cox = function(Matrix[Double] X, Matrix[Double] TE, Matrix[Double] F, Matrix[Double] R,
Double alpha = 0.05, Double tol = 0.000001, Int moi = 100, Int mii = 0)
return (Matrix[Double] M, Matrix[Double] S, Matrix[Double] T, Matrix[Double] COV, Matrix[Double] RT, Matrix[Double] XO) {
X_orig = X;
R_1_1 = as.scalar(R[1,1]);
######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET
if (R_1_1 != 0) { # factors available
if (ncol (R) != 2) {
stop ("Matrix R has wrong dimensions!");
}
print ("REMOVING BASELINE LEVEL OF EACH FACTOR...");
# identify baseline columns to be removed from X_orig
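# the baseline of each factor is taken to be the dummy-coded column with the largest column sum
# (i.e., the most frequent level) within the factor's index range; its index is zeroed out below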
col_sum = colSums (X_orig);
col_seq = t (seq(1, ncol (X_orig)));
parfor (i in 1:nrow (R), check = 0) {
start_ind = as.scalar (R[i,1]);
end_ind = as.scalar (R[i,2]);
baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
col_seq[,baseline_ind] = 0;
}
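# keep only the feature columns listed in F whose index has not been zeroed out above,
# i.e., drop the baseline column of each factor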
ones = matrix (1, rows = nrow (F), cols = 1);
F_filter = table (ones, F, 1, ncol (X_orig));
F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
TE_F = t(cbind (t (TE), F_filter));
} else if (ncol(F) > 0 & nrow(F) > 0) { # all features scale
TE_F = t(cbind (t (TE), t(F)));
} else { # no features available
TE_F = TE;
}
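# reorder and select the columns of X_orig via a permutation matrix so that column 1 holds the
# timestamps, column 2 the event indicator, and columns 3+ the selected feature columns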
X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));
######## RECODING TIMESTAMPS PRESERVING THE ORDER
print ("RECODING TIMESTAMPS...");
N = nrow (X_orig);
X_orig = order (target = X_orig, by = 1);
Idx = matrix (1, rows = N, cols = 1);
num_timestamps = 1;
if (N == 1)
RT = matrix (1, rows = 1, cols = 1);
else {
Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]);
num_timestamps = sum (Idx);
A = removeEmpty (target = diag (Idx), margin = "cols");
if (ncol (A) > 1) {
A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
B = cumsum (A);
RT = B %*% seq(1, ncol(B));
} else { # there is only one group
RT = matrix (1, rows = N, cols = 1);
}
}
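# E: event indicator per record (1 = event, 0 = censored), aligned with the sorted timestamps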
E = X_orig[,2];
print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");
######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
# b: the regression betas
# o: loss function value
# g: loss function gradient
# H: loss function Hessian
# sb: shift of b in one iteration
# so: shift of o in one iteration
# r: CG residual = H %*% sb + g
# d: CG direction vector
# Hd: = H %*% d
# c: scalar coefficient in CG
# delta: trust region size
# tol: tolerance value
# i: outer (Newton) iteration count
# j: inner (CG) iteration count
# computing initial coefficients b (all initialised to 0)
if (ncol (X_orig) > 2) {
X = X_orig[,3:ncol(X_orig)];
D = ncol (X);
zeros_D = matrix (0, rows = D, cols = 1);
b = zeros_D;
}
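# objective (a sketch): o(b) = - sum_{i: event} x_i' b + sum_r d_r * log( sum_{j in risk set of t_r} exp(x_j' b) )
# i.e., the negative Breslow partial log-likelihood, where d_r is the no. of events at the r-th
# distinct timestamp t_r and the risk set contains all records with timestamp >= t_r
# d_r: no. of events per distinct timestamp; e_r: no. of records per distinct timestamp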
d_r = aggregate (target = E, groups = RT, fn = "sum");
e_r = aggregate (target = RT, groups = RT, fn = "count");
# computing initial loss function value o
num_distinct = nrow (d_r); # no. of distinct timestamps
e_r_rev_agg = cumsum (rev(e_r));
d_r_rev = rev(d_r);
o = sum (d_r_rev * log (e_r_rev_agg));
o_init = o;
if (ncol (X_orig) < 3) {
loglik = -o;
S = matrix(0, 6, 1);
S[1, 1] = N;
S[2, 1] = 0; # number of events
S[3, 1] = loglik;
S[4, 1] = -1; # AIC
S[5, 1] = -1; # Rsquare
S[6, 1] = -1; #Rsquare_max
stop ("No features are selected!");
}
# computing initial gradient g
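# a sketch of the gradient at b = 0: g0 = - sum_{i: event} x_i + sum_r d_r * (average of x over the
# risk set of t_r); for general b the average is weighted by exp(x' b)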
# part 1 g0_1
g0_1 = - t (colSums (X * E)); # g_1
# part 2 g0_2
X_rev_agg = cumsum (rev(X));
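# select has a single 1 per row at column e_r_rev_agg[r]; multiplying it with a reversed cumulative
# sum picks, for the r-th largest distinct timestamp, the sum over its entire risk set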
select = table (seq (1, num_distinct), e_r_rev_agg);
X_agg = select %*% X_rev_agg;
g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
g0 = g0_1 + g0_2;
g = g0;
# initialization for trust region Newton method
delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
initial_g2 = sum (g ^ 2);
exit_g2 = initial_g2 * tol ^ 2;
maxiter = moi;
maxinneriter = ifelse (mii > 0, min (D, mii), D); # mii = 0: no max (CG needs at most D iterations)
i = 0;
sum_g2 = sum (g ^ 2);
while (sum_g2 > exit_g2 & i < maxiter) {
i = i + 1;
sb = zeros_D;
r = g;
r2 = sum (r ^ 2);
exit_r2 = 0.01 * r2;
d = - r;
trust_bound_reached = FALSE;
j = 0;
exp_Xb = exp (X %*% b);
exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
X_exp_Xb = X * exp_Xb;
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
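# inner CG loop: approximately solve H %*% sb = -g for the Newton step sb,
# stopping early once the step reaches the trust-region boundary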
while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
j = j + 1;
# computing Hessian times d (Hd)
# part 1 Hd_1
Xd = X %*% d;
X_Xd_exp_Xb = X * (Xd * exp_Xb);
X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;
Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
# part 2 Hd_2
Xd_exp_Xb = Xd * exp_Xb;
Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;
Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
Hd_2 = Hd_2_num / (D_r_rev ^ 2);
Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));
c = r2 / sum (d * Hd);
[c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
sb = sb + c * d;
r = r + c * Hd;
r2_new = sum (r ^ 2);
d = - r + (r2_new / r2) * d;
r2 = r2_new;
}
# computing loss change in 1 iteration (so)
# part 1 so_1
so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
# part 2 so_2
exp_Xbsb = exp (X %*% (b + sb));
exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
so = so_1 + so_2;
so = so - o;
delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
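# accept the step only if it decreases the objective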
if (so < 0) {
b = b + sb;
o = o + so;
# compute new gradient g
exp_Xb = exp (X %*% b);
exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
X_exp_Xb = X * exp_Xb;
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
g = g0_1 + g_2;
sum_g2 = sum (g ^ 2);
}
}
if (sum_g2 > exit_g2 & i >= maxiter) {
print ("Trust region Newton method did not converge!");
}
print ("COMPUTING HESSIAN...");
H0 = matrix (0, rows = D, cols = D);
H = matrix (0, rows = D, cols = D);
X_exp_Xb_rev_2 = rev(X_exp_Xb);
X_rev_2 = rev(X);
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
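# compute the upper triangles of H (Hessian of the objective at the fitted b) and of
# H0 (the same Hessian at b = 0, used for the score test), one row per covariate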
parfor (i in 1:D, check = 0) {
Xi = X[,i];
Xi_rev = rev(Xi);
## ----------Start calculating H0--------------
# part 1 H0_1
Xi_X = X_rev_2[,i:D] * Xi_rev;
Xi_X_rev_agg = cumsum (Xi_X);
Xi_X_rev_agg = select %*% Xi_X_rev_agg;
H0_1 = Xi_X_rev_agg / e_r_rev_agg;
# part 2 H0_2
Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
Xi_agg_rev_agg = cumsum (rev(Xi_agg));
H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
H0_2 = H0_2_num / (e_r_rev_agg ^ 2);
H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
#-----------End calculating H0--------------------
## ----------Start calculating H--------------
# part 1 H_1
Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;
# part 2 H_2
Xi_exp_Xb = exp_Xb * Xi;
Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");
Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
H_2 = H_2_num / (D_r_rev ^ 2);
H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
#-----------End calculating H--------------------
}
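# mirror the upper triangles to obtain the full symmetric matrices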
H = H + t(H) - diag( diag (H));
H0 = H0 + t(H0) - diag( diag (H0));
# compute standard error for betas
H_inv = inv (H);
se_b = sqrt (diag (H_inv));
# compute exp(b), Z, Pr[>|Z|]
P = matrix (0, rows = D, cols = 1);
exp_b = exp (b);
Z = b / se_b;
for (i in 1:D) {
Z_i_1_scalar = abs (as.scalar (Z[i,1]));
#FIXME: enable after test setup has been fixed
#P[i,1] = 2 - 2 * cdf (target = Z_i_1_scalar, dist = "normal");
}
# compute confidence intervals for b
z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
CI_l = b - se_b * z_alpha_2;
CI_r = b + se_b * z_alpha_2;
######## SOME STATISTICS AND TESTS
S = matrix(0, 6, 1);
# no. of records
S[1, 1] = N;
# no.of events
S[2, 1] = sum (E);
# log-likelihood
loglik = -o;
S[3, 1] = loglik;
# AIC
AIC = -2 * loglik + 2 * D;
S[4, 1] = AIC;
# Likelihood ratio test
lratio_t = 2 * o_init - 2 * o;
lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
# Rsquare (Cox & Snell)
Rsquare = 1 - exp (-lratio_t / N);
Rsquare_max = 1 - exp (-2 * o_init / N);
S[5, 1] = Rsquare;
S[6, 1] = Rsquare_max;
T = matrix(0, 3, 3);
# Wald test
wald_t = as.scalar (t(b) %*% H %*% b);
wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
T[1, 1] = wald_t;
T[1, 2] = D;
T[1, 3] = wald_p;
# Likelihood ratio test
T[2, 1] = lratio_t;
T[2, 2] = D;
T[2, 3] = lratio_p;
H0_inv = inv (H0);
score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
# T[3, 1] = score_t;
T[3, 2] = D;
# T[3, 3] = score_p;
M = matrix (1, rows = D, cols = 7);
M[, 1] = b;
M[, 2] = exp_b;
# M[, 3] = se_b;
# M[, 4] = Z;
# M[, 5] = P;
# M[, 6] = CI_l;
# M[, 7] = CI_r;
# variance-covariance matrix of the betas (inverse of the observed information)
COV = H_inv;
XO = X_orig;
}
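# Checks whether the proposed CG step length x would leave the trust region, where the inputs
# encode |sb + x*d|^2 - delta^2 = a*x^2 + b*x + c; if so, x is replaced by the positive root of
# this quadratic, i.e., the step length that exactly reaches the boundary (cancellation-safe form).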
ensure_trust_bound = function (double x, double a, double b, double c)
return (double x_new, boolean is_violated)
{
if (a * x^2 + b * x + c > 0) {
is_violated = TRUE;
rad = sqrt (b ^ 2 - 4 * a * c);
if (b >= 0)
x_new = - (2 * c) / (b + rad);
else
x_new = - (b - rad) / (2 * a);
} else {
is_violated = FALSE;
x_new = x;
}
}
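# Trust-region radius update: compares the actual objective change (so_exact) with its linear and
# quadratic model predictions, shrinking the radius when the step performed poorly (small rho) and
# allowing it to grow when the quadratic model was accurate (large rho).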
update_trust_bound = function (double delta, double sb_distance,
double so_exact, double so_linear_approx, double so_quadratic_approx)
return (double delta)
{
sigma1 = 0.25;
sigma2 = 0.5;
sigma3 = 4.0;
if (so_exact <= so_linear_approx)
alpha = sigma3;
else
alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
rho = so_exact / so_quadratic_approx;
if (rho < 0.0001) {
delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
} else {
if (rho < 0.25) {
delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
} else {
if (rho < 0.75) {
delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
} else {
delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
}
}
}
}