blob: b3fe29fe0a8d883a6212b82a19b53135c9de00e7 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL.
# The Breslow method is used for handling ties and the regression parameters
# are computed using trust region newton method with conjugate gradient
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X String --- Location to read the input matrix X containing the survival data containing the following information
# - 1: timestamps
# - 2: whether an event occurred (1) or data is censored (0)
# - 3: feature vectors
# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row)
# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model
# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing
# the start and end indices of the factors in X
# - R[,1]: start indices
# - R[,2]: end indices
# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X;
# in this case the start and end indices corresponding to the baseline level need to be the same;
# if R is not provided by default all variables are considered to be continuous
# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted
# Cox model (the betas), their standard errors, confidence intervals, and P-values
# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including
# no. of records, no. of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare;
# by default is standard output
# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model;
# by default is standard output
# COV String --- Location to store the variance-covariance matrix of the betas
# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X
# XO String --- Location to store sorted input matrix by the timestamps
# MF String --- Location to store column indices of X excluding the baseline factors if available
# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas
# tol Double 0.000001 Tolerance ("epsilon")
# moi Int 100 Max. number of outer (Newton) iterations
# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max
# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
# M[,1]: betas
# M[,2]: exp(betas)
# M[,3]: standard error of betas
# M[,4]: Z
# M[,5]: P-value
# M[,6]: lower 100*(1-alpha)% confidence interval of betas
# M[,7]: upper 100*(1-alpha)% confidence interval of betas
#
# Two log files containing a summary of some statistics of the fitted model:
# 1- File S with the following format
# - line 1: no. of observations
# - line 2: no. of events
# - line 3: log-likelihood
# - line 4: AIC
# - line 5: Rsquare (Cox & Snell)
# - line 6: max possible Rsquare
# 2- File T with the following format
# - line 1: Likelihood ratio test statistic, degree of freedom, P-value
# - line 2: Wald test statistic, degree of freedom, P-value
# - line 3: Score (log-rank) test statistic, degree of freedom, P-value
#
# Additionally, the following matrices are stored (needed for prediction)
# 1- A column matrix RT that contains the order-preserving recoded timestamps from X
# 2- Matrix XO which is matrix X with sorted timestamps
# 3- Variance-covariance matrix of the betas COV
# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
# -------------------------------------------------------------------------------------------
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R
# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT
# XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv
fileX = $X;
fileTE = $TE;
fileRT = $RT;
fileMF = $MF;
fileM = $M;
fileXO = $XO;
fileCOV = $COV;
# Default values of some parameters
fileF = ifdef ($F, " "); # $F=" "
fileR = ifdef ($R, " "); # $R=" "
fileS = ifdef ($S, " "); # $S=" "
fileT = ifdef ($T, " "); # $T=" "
fmtO = ifdef ($fmt, "text"); # $fmt="text"
alpha = ifdef ($alpha, 0.05); # $alpha=0.05
tol = ifdef ($tol, 0.000001); # $tol=0.000001;
maxiter = ifdef ($moi, 100); # $moi=100;
maxinneriter = ifdef ($mii, 0); # $mii=0;
X_orig = read (fileX);
TE = read (fileTE);
if (fileF != " ") {
F = read (fileF);
}
######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET
if (fileR != " ") { # factors available
R = read (fileR);
if (ncol (R) != 2) {
stop ("Matrix R has wrong dimensions!");
}
print ("REMOVING BASLINE LEVEL OF EACH FACTOR...");
# identify baseline columns to be removed from X_orig
col_sum = colSums (X_orig);
col_seq = t (seq(1, ncol (X_orig)));
parfor (i in 1:nrow (R), check = 0) {
start_ind = as.scalar (R[i,1]);
end_ind = as.scalar (R[i,2]);
baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
col_seq[,baseline_ind] = 0;
}
ones = matrix (1, rows = nrow (F), cols = 1);
F_filter = table (ones, F, 1, ncol (X_orig));
F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
TE_F = t(append (t (TE), F_filter));
} else if (fileF != " ") { # all features scale
TE_F = t(append (t (TE), t(F)));
} else { # no features available
TE_F = TE;
}
write (TE_F, fileMF, format = fmtO);
X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));
######## RECODING TIMESTAMPS PRESERVING THE ORDER
print ("RECODING TIMESTAMPS...");
N = nrow (X_orig);
X_orig = order (target = X_orig, by = 1);
Idx = matrix (1, rows = N, cols = 1);
num_timestamps = 1;
if (N == 1) {
RT = matrix (1, rows = 1, cols = 1);
} else {
Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]);
num_timestamps = sum (Idx);
A = removeEmpty (target = diag (Idx), margin = "cols");
if (ncol (A) > 1) {
A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
B = cumsum (A);
RT = B %*% seq(1, ncol(B));
} else { # there is only one group
RT = matrix (1, rows = N, cols = 1);
}
}
E = X_orig[,2];
print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");
######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
# b: the regression betas
# o: loss function value
# g: loss function gradient
# H: loss function Hessian
# sb: shift of b in one iteration
# so: shift of o in one iteration
# r: CG residual = H %*% sb + g
# d: CG direction vector
# Hd: = H %*% d
# c: scalar coefficient in CG
# delta: trust region size
# tol: tolerance value
# i: outer (Newton) iteration count
# j: inner (CG) iteration count
# computing initial coefficients b (all initialized to 0)
if (ncol (X_orig) > 2) {
X = X_orig[,3:ncol(X_orig)];
D = ncol (X);
zeros_D = matrix (0, rows = D, cols = 1);
b = zeros_D;
}
d_r = aggregate (target = E, groups = RT, fn = "sum");
e_r = aggregate (target = RT, groups = RT, fn = "count");
# computing initial loss function value o
num_distinct = nrow (d_r); # no. of distinct timestamps
e_r_rev_agg = cumsum (rev(e_r));
d_r_rev = rev(d_r);
o = sum (d_r_rev * log (e_r_rev_agg));
o_init = o;
if (ncol (X_orig) < 3) {
loglik = -o;
S_str = "no. of records " + N + " loglik " + loglik;
if (fileS != " ") {
write (S_str, fileS, format = fmtO);
} else {
print (S_str);
}
stop ("No features are selected!");
}
# computing initial gradient g
# part 1 g0_1
g0_1 = - t (colSums (X * E)); # g_1
# part 2 g0_2
X_rev_agg = cumsum (rev(X));
select = table (seq (1, num_distinct), e_r_rev_agg);
X_agg = select %*% X_rev_agg;
g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
#
g0 = g0_1 + g0_2;
g = g0;
# initialization for trust region Newton method
delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
initial_g2 = sum (g ^ 2);
exit_g2 = initial_g2 * tol ^ 2;
maxiter = 100;
maxinneriter = min (D, 100);
i = 0;
sum_g2 = sum (g ^ 2);
while (sum_g2 > exit_g2 & i < maxiter) {
i = i + 1;
sb = zeros_D;
r = g;
r2 = sum (r ^ 2);
exit_r2 = 0.01 * r2;
d = - r;
trust_bound_reached = FALSE;
j = 0;
exp_Xb = exp (X %*% b);
exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
X_exp_Xb = X * exp_Xb;
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
j = j + 1;
# computing Hessian times d (Hd)
# part 1 Hd_1
Xd = X %*% d;
X_Xd_exp_Xb = X * (Xd * exp_Xb);
X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;
Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
# part 2 Hd_2
Xd_exp_Xb = Xd * exp_Xb;
Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;
Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
Hd_2 = Hd_2_num / (D_r_rev ^ 2);
Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));
c = r2 / sum (d * Hd);
[c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
sb = sb + c * d;
r = r + c * Hd;
r2_new = sum (r ^ 2);
d = - r + (r2_new / r2) * d;
r2 = r2_new;
}
# computing loss change in 1 iteration (so)
# part 1 so_1
so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
# part 2 so_2
exp_Xbsb = exp (X %*% (b + sb));
exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
#
so = so_1 + so_2;
so = so - o;
delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
if (so < 0) {
b = b + sb;
o = o + so;
# compute new gradient g
exp_Xb = exp (X %*% b);
exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
X_exp_Xb = X * exp_Xb;
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
g = g0_1 + g_2;
sum_g2 = sum (g ^ 2);
}
}
if (sum_g2 > exit_g2 & i >= maxiter) {
print ("Trust region Newton method did not converge!");
}
print ("COMPUTING HESSIAN...");
H0 = matrix (0, rows = D, cols = D);
H = matrix (0, rows = D, cols = D);
X_exp_Xb_rev_2 = rev(X_exp_Xb);
X_rev_2 = rev(X);
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
parfor (i in 1:D, check = 0) {
Xi = X[,i];
Xi_rev = rev(Xi);
## ----------Start calculating H0--------------
# part 1 H0_1
Xi_X = X_rev_2[,i:D] * Xi_rev;
Xi_X_rev_agg = cumsum (Xi_X);
Xi_X_rev_agg = select %*% Xi_X_rev_agg;
H0_1 = Xi_X_rev_agg / e_r_rev_agg;
# part 2 H0_2
Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
Xi_agg_rev_agg = cumsum (rev(Xi_agg));
H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
H0_2 = H0_2_num / (e_r_rev_agg ^ 2);
H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
#-----------End calculating H0--------------------
## ----------Start calculating H--------------
# part 1 H_1
Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;
# part 2 H_2
Xi_exp_Xb = exp_Xb * Xi;
Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");
Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
H_2 = H_2_num / (D_r_rev ^ 2);
H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
#-----------End calculating H--------------------
}
H = H + t(H) - diag( diag (H));
H0 = H0 + t(H0) - diag( diag (H0));
# compute standard error for betas
H_inv = inv (H);
se_b = sqrt (diag (H_inv));
# compute exp(b), Z, Pr[>|Z|]
exp_b = exp (b);
Z = b / se_b;
P = matrix (0, rows = D, cols = 1);
parfor (i in 1:D) {
P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal"));
}
# compute confidence intervals for b
z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
CI_l = b - se_b * z_alpha_2;
CI_r = b - se_b + z_alpha_2;
######## SOME STATISTICS AND TESTS
# no. of records
S_str = "no. of records " + N;
# no.of events
S_str = append (S_str, "no. of events " + sum (E));
# log-likelihood
loglik = -o;
S_str = append (S_str, "loglik " + loglik + " ");
# AIC = -2 * loglik + 2 * D
AIC = -2 * loglik + 2 * D;
S_str = append (S_str, "AIC " + AIC + " ");
# Wald test
wald_t = as.scalar (t(b) %*% H %*% b);
wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " ";
# Likelihood ratio test
lratio_t = 2 * o_init - 2 * o;
lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " ");
H0_inv = inv (H0);
score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " ");
# Rsquare (Cox & Snell)
Rsquare = 1 - exp (-lratio_t / N);
Rsquare_max = 1 - exp (-2 * o_init / N);
S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " ");
S_str = append (S_str, "max possible Rsquare: " + Rsquare_max);
M = matrix (0, rows = D, cols = 7);
M[,1] = b;
M[,2] = exp_b;
M[,3] = se_b;
M[,4] = Z;
M[,5] = P;
M[,6] = CI_l;
M[,7] = CI_r;
write (M, fileM, format = fmtO);
if (fileS != " ") {
write (S_str, fileS, format = fmtO);
} else {
print (S_str);
}
if (fileT != " ") {
write (T_str, fileT, format = fmtO);
} else {
print (T_str);
}
# needed for prediction
write (RT, fileRT, format = fmtO);
write (H_inv, fileCOV, format = fmtO);
write (X_orig, fileXO, format = fmtO);
####### UDFS FOR TRUST REGION NEWTON METHOD
ensure_trust_bound =
function (double x, double a, double b, double c)
return (double x_new, boolean is_violated)
{
if (a * x^2 + b * x + c > 0)
{
is_violated = TRUE;
rad = sqrt (b ^ 2 - 4 * a * c);
if (b >= 0) {
x_new = - (2 * c) / (b + rad);
} else {
x_new = - (b - rad) / (2 * a);
}
} else {
is_violated = FALSE;
x_new = x;
}
}
update_trust_bound =
function (double delta,
double sb_distance,
double so_exact,
double so_linear_approx,
double so_quadratic_approx)
return (double delta)
{
sigma1 = 0.25;
sigma2 = 0.5;
sigma3 = 4.0;
if (so_exact <= so_linear_approx) {
alpha = sigma3;
} else {
alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
}
rho = so_exact / so_quadratic_approx;
if (rho < 0.0001) {
delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
} else { if (rho < 0.25) {
delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
} else { if (rho < 0.75) {
delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
} else {
delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
}}}
}