#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT FITS A COX PROPORTIONAL HAZARDS REGRESSION MODEL.
# The Breslow method is used for handling ties, and the regression parameters are
# computed using a trust-region Newton method with conjugate gradient.
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME   TYPE    DEFAULT   MEANING
# ---------------------------------------------------------------------------------------------
# X      Matrix  ---       Input matrix X containing the survival data with the following
#                          information:
#                          1: timestamps
#                          2: whether an event occurred (1) or the data is censored (0)
#                          3: feature vectors
# TE     Matrix  ---       Column indices of X (as a column vector) which contain the timestamps
#                          (first row) and the event information (second row)
# F      Matrix  ---       Column indices of X (as a column vector) which are to be used for
#                          fitting the Cox model
# R      Matrix  ---       If factors (categorical variables) are available in the input matrix
#                          X, matrix R containing the start and end indices of the factors in X
#                          R[,1]: start indices
#                          R[,2]: end indices
#                          Alternatively, the user can specify the index of the baseline level
#                          of each factor which needs to be removed from X; in this case the
#                          start and end indices corresponding to the baseline level need to be
#                          the same; if R is not provided (i.e., R[1,1] = 0), all variables are
#                          considered to be continuous
# alpha  Double  0.05      Parameter to compute a 100*(1-alpha)% confidence interval for the betas
# tol    Double  0.000001  Tolerance ("epsilon")
# moi    Int     100       Max. number of outer (Newton) iterations
# mii    Int     0         Max. number of inner (conjugate gradient) iterations, 0 = no max
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
# M[,1]: betas
# M[,2]: exp(betas)
# M[,3]: standard error of betas
# M[,4]: Z
# M[,5]: P-value
# M[,6]: lower 100*(1-alpha)% confidence interval of betas
# M[,7]: upper 100*(1-alpha)% confidence interval of betas
#
# Two matrices containing a summary of some statistics of the fitted model:
# 1 - Matrix S with the following format
#     - row 1: no. of observations
#     - row 2: no. of events
#     - row 3: log-likelihood
#     - row 4: AIC
#     - row 5: Rsquare (Cox & Snell)
#     - row 6: maximum possible Rsquare
# 2 - Matrix T with the following format
#     - row 1: Likelihood ratio test statistic, degrees of freedom, P-value
#     - row 2: Wald test statistic, degrees of freedom, P-value
#     - row 3: Score (log-rank) test statistic, degrees of freedom, P-value
#
# Additionally, the following matrices are returned (needed for prediction):
# 1 - A column matrix RT that contains the order-preserving recoded timestamps from X
# 2 - Matrix XO which is matrix X with sorted timestamps
# 3 - Variance-covariance matrix of the betas COV
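#
# Example usage (a minimal sketch; the file name and column indices below are hypothetical and
# assume a 4-column X with two scale features in columns 3 and 4, with this function in scope):
#
#   X  = read ("survival_data.csv", format = "csv");
#   TE = matrix ("1 2", rows = 2, cols = 1);   # timestamp and event columns of X
#   F  = matrix ("3 4", rows = 2, cols = 1);   # feature columns of X used for fitting
#   R  = matrix (0, rows = 1, cols = 1);       # R[1,1] = 0: no categorical factors
#   [M, S, T, COV, RT, XO] = m_cox (X = X, TE = TE, F = F, R = R,
#                                   alpha = 0.05, tol = 0.000001, moi = 100, mii = 0);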
# -------------------------------------------------------------------------------------------
m_cox = function(Matrix[Double] X, Matrix[Double] TE, Matrix[Double] F, Matrix[Double] R,
Double alpha = 0.05, Double tol = 0.000001, Int moi = 100, Int mii = 0)
return (Matrix[Double] M, Matrix[Double] S, Matrix[Double] T, Matrix[Double] COV, Matrix[Double] RT, Matrix[Double] XO) {
X_orig = X;
R_1_1 = as.scalar(R[1,1]);
######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET
if (R_1_1 != 0) { # factors available
if (ncol (R) != 2) {
stop ("Matrix R has wrong dimensions!");
}
print ("REMOVING BASELINE LEVEL OF EACH FACTOR...");
# identify baseline columns to be removed from X_orig
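# the baseline of each factor is taken to be the dummy-coded column with the largest column sum
# (i.e., the most frequent level) within the factor's index range; its index is zeroed out below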
col_sum = colSums (X_orig);
col_seq = t (seq(1, ncol (X_orig)));
parfor (i in 1:nrow (R), check = 0) {
start_ind = as.scalar (R[i,1]);
end_ind = as.scalar (R[i,2]);
baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
col_seq[,baseline_ind] = 0;
}
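# keep only the feature columns listed in F whose index has not been zeroed out above,
# i.e., drop the baseline column of each factor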
ones = matrix (1, rows = nrow (F), cols = 1);
F_filter = table (ones, F, 1, ncol (X_orig));
F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
TE_F = t(cbind (t (TE), F_filter));
} else if (ncol(F) > 0 & nrow(F) > 0) { # all features scale
TE_F = t(cbind (t (TE), t(F)));
} else { # no features available
TE_F = TE;
}
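# reorder and select the columns of X_orig via a permutation matrix so that column 1 holds the
# timestamps, column 2 the event indicator, and columns 3+ the selected feature columns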
X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));
######## RECODING TIMESTAMPS PRESERVING THE ORDER
print ("RECODING TIMESTAMPS...");
N = nrow (X_orig);
X_orig = order (target = X_orig, by = 1);
Idx = matrix (1, rows = N, cols = 1);
num_timestamps = 1;
if (N == 1)
RT = matrix (1, rows = 1, cols = 1);
else {
Idx[2:N,1] = (X_orig[1:(N - 1),1] != X_orig[2:N,1]);
num_timestamps = sum (Idx);
A = removeEmpty (target = diag (Idx), margin = "cols");
if (ncol (A) > 1) {
A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
B = cumsum (A);
RT = B %*% seq(1, ncol(B));
} else { # there is only one group
RT = matrix (1, rows = N, cols = 1);
}
}
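# E: event indicator per record (1 = event, 0 = censored), aligned with the sorted timestamps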
E = X_orig[,2];
print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");
######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
# b: the regression betas
# o: loss function value
# g: loss function gradient
# H: loss function Hessian
# sb: shift of b in one iteration
# so: shift of o in one iteration
# r: CG residual = H %*% sb + g
# d: CG direction vector
# Hd: = H %*% d
# c: scalar coefficient in CG
# delta: trust region size
# tol: tolerance value
# i: outer (Newton) iteration count
# j: inner (CG) iteration count
# computing initial coefficients b (all initialised to 0)
if (ncol (X_orig) > 2) {
X = X_orig[,3:ncol(X_orig)];
D = ncol (X);
zeros_D = matrix (0, rows = D, cols = 1);
b = zeros_D;
}
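# objective (a sketch): o(b) = - sum_{i: event} x_i' b + sum_r d_r * log( sum_{j in risk set of t_r} exp(x_j' b) )
# i.e., the negative Breslow partial log-likelihood, where d_r is the no. of events at the r-th
# distinct timestamp t_r and the risk set contains all records with timestamp >= t_r
# d_r: no. of events per distinct timestamp; e_r: no. of records per distinct timestamp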
d_r = aggregate (target = E, groups = RT, fn = "sum");
e_r = aggregate (target = RT, groups = RT, fn = "count");
# computing initial loss function value o
num_distinct = nrow (d_r); # no. of distinct timestamps
e_r_rev_agg = cumsum (rev(e_r));
d_r_rev = rev(d_r);
o = sum (d_r_rev * log (e_r_rev_agg));
o_init = o;
if (ncol (X_orig) < 3) {
loglik = -o;
S = matrix(0, 6, 1);
S[1, 1] = N;
S[2, 1] = 0; # number of events
S[3, 1] = loglik;
S[4, 1] = -1; # AIC
S[5, 1] = -1; # Rsquare
S[6, 1] = -1; #Rsquare_max
stop ("No features are selected!");
}
# computing initial gradient g
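# a sketch of the gradient at b = 0: g0 = - sum_{i: event} x_i + sum_r d_r * (average of x over the
# risk set of t_r); for general b the average is weighted by exp(x' b)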
# part 1 g0_1
g0_1 = - t (colSums (X * E)); # g_1
# part 2 g0_2
X_rev_agg = cumsum (rev(X));
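# select has a single 1 per row at column e_r_rev_agg[r]; multiplying it with a reversed cumulative
# sum picks, for the r-th largest distinct timestamp, the sum over its entire risk set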
select = table (seq (1, num_distinct), e_r_rev_agg);
X_agg = select %*% X_rev_agg;
g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
g0 = g0_1 + g0_2;
g = g0;
# initialization for trust region Newton method
delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
initial_g2 = sum (g ^ 2);
exit_g2 = initial_g2 * tol ^ 2;
maxiter = moi;
maxinneriter = ifelse (mii > 0, min (D, mii), D); # mii = 0: no max (CG needs at most D iterations)
i = 0;
sum_g2 = sum (g ^ 2);
while (sum_g2 > exit_g2 & i < maxiter) {
i = i + 1;
sb = zeros_D;
r = g;
r2 = sum (r ^ 2);
exit_r2 = 0.01 * r2;
d = - r;
trust_bound_reached = FALSE;
j = 0;
exp_Xb = exp (X %*% b);
exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
X_exp_Xb = X * exp_Xb;
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
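# inner CG loop: approximately solve H %*% sb = -g for the Newton step sb,
# stopping early once the step reaches the trust-region boundary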
while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
j = j + 1;
# computing Hessian times d (Hd)
# part 1 Hd_1
Xd = X %*% d;
X_Xd_exp_Xb = X * (Xd * exp_Xb);
X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;
Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
# part 2 Hd_2
Xd_exp_Xb = Xd * exp_Xb;
Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;
Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
Hd_2 = Hd_2_num / (D_r_rev ^ 2);
Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));
c = r2 / sum (d * Hd);
[c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
sb = sb + c * d;
r = r + c * Hd;
r2_new = sum (r ^ 2);
d = - r + (r2_new / r2) * d;
r2 = r2_new;
}
# computing loss change in 1 iteration (so)
# part 1 so_1
so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
# part 2 so_2
exp_Xbsb = exp (X %*% (b + sb));
exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
so = so_1 + so_2;
so = so - o;
delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
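# accept the step only if it decreases the objective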
if (so < 0) {
b = b + sb;
o = o + so;
# compute new gradient g
exp_Xb = exp (X %*% b);
exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
X_exp_Xb = X * exp_Xb;
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
g = g0_1 + g_2;
sum_g2 = sum (g ^ 2);
}
}
if (sum_g2 > exit_g2 & i >= maxiter) {
print ("Trust region Newton method did not converge!");
}
print ("COMPUTING HESSIAN...");
H0 = matrix (0, rows = D, cols = D);
H = matrix (0, rows = D, cols = D);
X_exp_Xb_rev_2 = rev(X_exp_Xb);
X_rev_2 = rev(X);
X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
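# compute the upper triangles of H (Hessian of the objective at the fitted b) and of
# H0 (the same Hessian at b = 0, used for the score test), one row per covariate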
parfor (i in 1:D, check = 0) {
Xi = X[,i];
Xi_rev = rev(Xi);
## ----------Start calculating H0--------------
# part 1 H0_1
Xi_X = X_rev_2[,i:D] * Xi_rev;
Xi_X_rev_agg = cumsum (Xi_X);
Xi_X_rev_agg = select %*% Xi_X_rev_agg;
H0_1 = Xi_X_rev_agg / e_r_rev_agg;
# part 2 H0_2
Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
Xi_agg_rev_agg = cumsum (rev(Xi_agg));
H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
H0_2 = H0_2_num / (e_r_rev_agg ^ 2);
H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
#-----------End calculating H0--------------------
## ----------Start calculating H--------------
# part 1 H_1
Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;
# part 2 H_2
Xi_exp_Xb = exp_Xb * Xi;
Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");
Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
H_2 = H_2_num / (D_r_rev ^ 2);
H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
#-----------End calculating H--------------------
}
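# mirror the upper triangles to obtain the full symmetric matrices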
H = H + t(H) - diag( diag (H));
H0 = H0 + t(H0) - diag( diag (H0));
# compute standard error for betas
H_inv = inv (H);
se_b = sqrt (diag (H_inv));
# compute exp(b), Z, Pr[>|Z|]
P = matrix (0, rows = D, cols = 1);
exp_b = exp (b);
Z = b / se_b;
for (i in 1:D) {
Z_i_1_scalar = abs (as.scalar (Z[i,1]));
#FIXME: enable after test setup has been fixed
#P[i,1] = 2 - 2 * cdf (target = Z_i_1_scalar, dist = "normal");
}
# compute confidence intervals for b
z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
CI_l = b - se_b * z_alpha_2;
CI_r = b + se_b * z_alpha_2;
######## SOME STATISTICS AND TESTS
S = matrix(0, 6, 1);
# no. of records
S[1, 1] = N;
# no.of events
S[2, 1] = sum (E);
# log-likelihood
loglik = -o;
S[3, 1] = loglik;
# AIC
AIC = -2 * loglik + 2 * D;
S[4, 1] = AIC;
# Likelihood ratio test
lratio_t = 2 * o_init - 2 * o;
lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
# Rsquare (Cox & Snell)
Rsquare = 1 - exp (-lratio_t / N);
Rsquare_max = 1 - exp (-2 * o_init / N);
S[5, 1] = Rsquare;
S[6, 1] = Rsquare_max;
T = matrix(0, 3, 3);
# Wald test
wald_t = as.scalar (t(b) %*% H %*% b);
wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
T[1, 1] = wald_t;
T[1, 2] = D;
T[1, 3] = wald_p;
# Likelihood ratio test
T[2, 1] = lratio_t;
T[2, 2] = D;
T[2, 3] = lratio_p;
H0_inv = inv (H0);
score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
# T[3, 1] = score_t;
T[3, 2] = D;
# T[3, 3] = score_p;
M = matrix (1, rows = D, cols = 7);
M[, 1] = b;
M[, 2] = exp_b;
# M[, 3] = se_b;
# M[, 4] = Z;
# M[, 5] = P;
# M[, 6] = CI_l;
# M[, 7] = CI_r;
# variance-covariance matrix of the betas (inverse of the observed information)
COV = H_inv;
XO = X_orig;
}
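# Checks whether the proposed CG step length x would leave the trust region, where the inputs
# encode |sb + x*d|^2 - delta^2 = a*x^2 + b*x + c; if so, x is replaced by the positive root of
# this quadratic, i.e., the step length that exactly reaches the boundary (cancellation-safe form).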
ensure_trust_bound = function (double x, double a, double b, double c)
return (double x_new, boolean is_violated)
{
if (a * x^2 + b * x + c > 0) {
is_violated = TRUE;
rad = sqrt (b ^ 2 - 4 * a * c);
if (b >= 0)
x_new = - (2 * c) / (b + rad);
else
x_new = - (b - rad) / (2 * a);
} else {
is_violated = FALSE;
x_new = x;
}
}
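# Trust-region radius update: compares the actual objective change (so_exact) with its linear and
# quadratic model predictions, shrinking the radius when the step performed poorly (small rho) and
# allowing it to grow when the quadratic model was accurate (large rho).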
update_trust_bound = function (double delta, double sb_distance,
double so_exact, double so_linear_approx, double so_quadratic_approx)
return (double delta)
{
sigma1 = 0.25;
sigma2 = 0.5;
sigma3 = 4.0;
if (so_exact <= so_linear_approx)
alpha = sigma3;
else
alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
rho = so_exact / so_quadratic_approx;
if (rho < 0.0001) {
delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
} else {
if (rho < 0.25) {
delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
} else {
if (rho < 0.75) {
delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
} else {
delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
}
}
}
}