#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT ANALYZES SURVIVAL DATA USING KAPLAN-MEIER ESTIMATES
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME    TYPE    DEFAULT      MEANING
# ---------------------------------------------------------------------------------------------
# X       String  ---          Location to read the input matrix X containing the survival data:
#                              timestamps, whether an event occurred (1) or the data is censored (0),
#                              and a number of factors (categorical features) for grouping and/or stratifying
# TE      String  ---          Column indices of X which contain timestamps (first entry) and event
#                              information (second entry)
# GI      String  ---          Column indices of X corresponding to the factors to be used for grouping
# SI      String  ---          Column indices of X corresponding to the factors to be used for stratifying
# O       String  ---          Location to write the matrix containing the results of the Kaplan-Meier
#                              analysis; see below for the description
# M       String  ---          Location to write matrix M containing the following statistics: total number
#                              of records, total number of events, median survival time and its confidence
#                              interval; if survival data for multiple groups and strata are provided, each
#                              row of M contains the above statistics per group and stratum
# T       String  " "          If survival data from multiple groups is available and ttype=log-rank or
#                              wilcoxon, location to write the matrix containing the results of the
#                              (stratified) test for comparing multiple groups
# alpha   Double  0.05         Parameter to compute 100*(1-alpha)% confidence intervals for the survivor
#                              function and its median
# etype   String  "greenwood"  Parameter to specify the error type: "greenwood" (the default) or "peto"
# ctype   String  "log"        Parameter to modify the confidence interval; "plain" keeps the lower and
#                              upper bounds of the confidence interval unmodified, "log" (the default)
#                              corresponds to the log transformation and "log-log" to the complementary
#                              log-log transformation
# ttype   String  "none"       If survival data for multiple groups is available, specifies which test to
#                              perform for comparing survival data across groups: "none" (the default),
#                              "log-rank" or "wilcoxon"
# fmt     String  "text"       The output format of the results of the Kaplan-Meier analysis, such as
#                              "text" or "csv"
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# 1- Matrix KM whose dimension depends on the number of groups (denoted by g) and strata (denoted by s) in the data:
# each collection of 7 consecutive columns in KM corresponds to a unique combination of groups and strata in the data with the following schema
# 1. col: timestamp
# 2. col: no. at risk
# 3. col: no. of events
# 4. col: Kaplan-Meier estimate of survivor function surv
# 5. col: standard error of surv
# 6. col: lower 100*(1-alpha)% confidence interval for surv
# 7. col: upper 100*(1-alpha)% confidence interval for surv
# 2- Matrix M whose dimension depends on the number of groups (g) and strata (s) in the data (k denotes the number of factors used for grouping,
# i.e., ncol(GI), and l denotes the number of factors used for stratifying, i.e., ncol(SI))
# M[,1:k]: unique combination of values in the k factors used for grouping
# M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
# M[,k+l+1]: total number of records
# M[,k+l+2]: total number of events
# M[,k+l+3]: median survival time
# M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median
# M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median
# If the number of groups and strata is equal to 1, M will have 5 columns with
# M[,1]: total number of records
# M[,2]: total number of events
# M[,3]: median survival time
# M[,4]: lower 100*(1-alpha)% confidence interval of the median
# M[,5]: upper 100*(1-alpha)% confidence interval of the median
# 3- If survival data from multiple groups is available and ttype=log-rank or wilcoxon, a 1 x 4 matrix T and a g x 5 matrix T_GROUPS_OE with
# T_GROUPS_OE[,1] = no. of events
# T_GROUPS_OE[,2] = observed value (O)
# T_GROUPS_OE[,3] = expected value (E)
# T_GROUPS_OE[,4] = (O-E)^2/E
# T_GROUPS_OE[,5] = (O-E)^2/V
# T[1,1] = no. of groups
# T[1,2] = degrees of freedom of the Chi-squared distributed test statistic
# T[1,3] = test statistic
# T[1,4] = P-value
# -------------------------------------------------------------------------------------------
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f KM.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE GI=INPUT_DIR/GI SI=INPUT_DIR/SI O=OUTPUT_DIR/O
# M=OUTPUT_DIR/M T=OUTPUT_DIR/T alpha=0.05 etype=greenwood ctype=log fmt=csv
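#
# For illustration only (hypothetical column layout): if X had the four columns
# [timestamp, event, treatment, hospital], then TE would select columns 1 and 2,
# GI column 3 (grouping by treatment) and SI column 4 (stratifying by hospital).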
fileX = $X;
fileTE = $TE;
fileGI = ifdef ($GI, " ");
fileSI = ifdef ($SI, " ");
fileO = $O;
fileM = $M;
# Default values of some parameters
fileT = ifdef ($T, " "); # $T=" "
fileG = ifdef ($G, " "); # $G=" "
fileS = ifdef ($S, " "); # $S=" "
fmtO = ifdef ($fmt, "text"); # $fmt="text"
alpha = ifdef ($alpha, 0.05); # $alpha=0.05
err_type = ifdef ($etype, "greenwood"); # $etype="greenwood"
conf_type = ifdef ($ctype, "log"); # $ctype="log"
test_type = ifdef ($ttype, "none"); # $ttype="none"
X = read (fileX);
TE = read (fileTE);
if (fileGI != " ") {
GI = read (fileGI);
} else {
GI = matrix (0, rows = 1, cols = 1);
}
if (fileSI != " ") {
SI = read (fileSI);
} else {
SI = matrix (0, rows = 1, cols = 1);
}
TE = t(TE);
GI = t(GI);
SI = t(SI);
# check arguments for validity
if (err_type != "greenwood" & err_type != "peto") {
stop (err_type + " is not a valid error type!");
}
if (conf_type != "plain" & conf_type != "log" & conf_type != "log-log") {
stop (conf_type + " is not a valid confidence type!");
}
if (test_type != "log-rank" & test_type != "wilcoxon" & test_type != "none") {
stop (test_type + " is not a valid test type!");
}
n_group_cols = ncol (GI);
n_stratum_cols = ncol (SI);
# check GI and SI for validity
GI_1_1 = as.scalar (GI[1,1]);
SI_1_1 = as.scalar (SI[1,1]);
if (n_group_cols == 1) {
if (GI_1_1 == 0) { # no factors for grouping
n_group_cols = 0;
}
} else if (GI_1_1 == 0) {
stop ("Matrix GI contains zero entries!");
}
if (n_stratum_cols == 1) {
if (SI_1_1 == 0) { # no factors for stratifying
n_stratum_cols = 0;
}
} else if (SI_1_1 == 0) {
stop ("Matrix SI contains zero entries!");
}
if (2 + n_group_cols + n_stratum_cols > ncol (X)) {
stop ("X has an incorrect number of columns!");
}
# reorder cols of X
if (GI_1_1 == 0 & SI_1_1 == 0) {
Is = TE;
} else if (GI_1_1 == 0) {
Is = append (TE, SI);
} else if (SI_1_1 == 0) {
Is = append (TE, GI);
} else {
Is = append (TE, append (GI, SI));
}
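# table() builds an ncol(X) x (2 + n_group_cols + n_stratum_cols) selection matrix from the index
# vector Is, so the multiplication below reorders the columns of X into
# [time, event, grouping factors, stratifying factors]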
X = X %*% table (Is, seq (1, 2 + n_group_cols + n_stratum_cols), ncol (X), 2 + n_group_cols + n_stratum_cols);
num_records = nrow (X);
num_groups = 1;
num_strata = 1;
### compute group id for each record
print ("Perform grouping...");
if (n_group_cols > 0) {
for (g in 1:n_group_cols) { # sort columns corresponding to groups sequentially
X = order (target = X, by = 2 + g);
}
XG = X[,3:(3 + n_group_cols - 1)];
Idx = matrix (1, rows = num_records, cols = 1);
Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),3:(2 + n_group_cols)] != X[2:num_records,3:(2 + n_group_cols)]);
num_groups = sum (Idx);
XG = replace (target = XG, pattern = 0, replacement = "Infinity");
XG = XG * Idx;
XG = replace (target = XG, pattern = "NaN", replacement = 0);
G_cols = removeEmpty (target = XG, margin = "rows");
G_cols = replace (target = G_cols, pattern = "Infinity", replacement = 0);
A = removeEmpty (target = diag (Idx), margin = "cols");
if (ncol (A) > 1) {
A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
B = cumsum (A);
Gi = B %*% seq(1, ncol(B)); # group ids
} else { # there is only one group
Gi = matrix (1, rows = num_records, cols = 1);
}
if (n_stratum_cols > 0) {
X = append (append (X[,1:2],Gi), X[,(3 + g):ncol (X)]);
} else { # no strata
X = append (X[,1:2],Gi);
}
}
### compute stratum id for each record
print ("Perform stratifying...");
if (n_stratum_cols > 0) {
s_offset = 2;
if (n_group_cols > 0) {
s_offset = 3;
}
for (s in 1:n_stratum_cols) { # sort columns corresponding to strata sequentially
X = order (target = X, by = s_offset + s);
}
XS = X[,(s_offset + 1):(s_offset + n_stratum_cols)];
Idx = matrix (1, rows = num_records, cols = 1);
Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),(s_offset + 1):(s_offset + n_stratum_cols)] != X[2:num_records,(s_offset + 1):(s_offset + n_stratum_cols)]);
num_strata = sum (Idx);
XS = replace (target = XS, pattern = 0, replacement = "Infinity");
XS = XS * Idx;
XS = replace (target = XS, pattern = "NaN", replacement = 0);
S_cols = removeEmpty (target = XS, margin = "rows");
S_cols = replace (target = S_cols, pattern = "Infinity", replacement = 0);
SB = removeEmpty (target = seq (1,num_records), margin = "rows", select = Idx); # indices of stratum boundaries
A = removeEmpty (target = diag (Idx), margin = "cols");
if (ncol (A) > 1) {
A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
B = cumsum (A);
Si = B %*% seq(1, ncol(B)); # stratum ids
} else { # there is only one stratum
Si = matrix (1, rows = num_records, cols = 1);
}
X = append (X[,1:3],Si);
}
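# pad X with constant columns so that the four columns accessed below (time, event, group and
# stratum information) always exist; without stratifying factors SB marks a single stratum
# starting at row 1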
if (n_group_cols == 0 & n_stratum_cols == 0) {
X = append (X, matrix (1, rows = num_records, cols = 2));
SB = matrix (1, rows = 1, cols = 1);
} else if (n_group_cols == 0) {
X = append (X[,1:2], append (matrix (1, rows = num_records, cols = 1), X[,3]));
} else if (n_stratum_cols == 0) {
X = append (X, matrix (1, rows = num_records, cols = 1));
SB = matrix (1, rows = 1, cols = 1);
}
######## BEGIN KAPLAN-MEIER ANALYSIS
print ("BEGIN KAPLAN-MEIER SURVIVAL FIT SCRIPT");
KM = matrix (0, rows = num_records, cols = num_groups * num_strata * 7);
KM_cols_select = matrix (1, rows = num_groups * num_strata * 7, cols = 1);
GSI = matrix (0, rows = num_groups * num_strata, cols = 2);
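# M is initialized with Infinity so that entries that are never written (e.g., a median that does
# not exist) can be converted to NaN before the results are written out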
a = 1/0;
M = matrix (a, rows = num_groups * num_strata, cols = 5);
M_cols = seq (1, num_groups * num_strata);
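# standard normal quantile z_{1-alpha/2} used for all 100*(1-alpha)% confidence intervals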
z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
if (num_groups > 1 & test_type != "none") {
str = "";
TEST = matrix (0, rows = num_groups, cols = 5);
TEST_GROUPS_OE = matrix (0, rows = 1, cols = 4);
U = matrix (0, rows = num_groups, cols = num_strata);
U_OE = matrix (0, rows = num_groups, cols = num_strata);
OBS = matrix (0, rows = num_groups, cols = num_strata);
EXP = matrix (0, rows = num_groups, cols = num_strata);
V_sum_total = matrix (0, rows = num_groups-1, cols = (num_groups-1) * num_strata);
n_event_all_global = matrix(1, rows=num_groups, cols=num_strata);
} else if (num_groups == 1 & test_type != "none") {
stop ("Data contains only one group or no groups, at least two groups are required for test!");
}
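# process each stratum independently; SB holds the first row index of every stratum in the sorted X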
parfor (s in 1:num_strata, check = 0) {
start_ind = as.scalar (SB[s,]);
end_ind = num_records;
if (s != num_strata) {
end_ind = as.scalar (SB[s + 1,]) - 1;
}
######## RECODING TIMESTAMPS PRESERVING THE ORDER
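# map the timestamps within the stratum to consecutive ranks RT (1,2,...) so that aggregate() can
# group records by distinct timestamp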
X_cur = X[start_ind:end_ind,];
range = end_ind - start_ind + 1;
X_cur = order (target = X_cur, by = 1);
Idx1 = matrix (1, rows = range, cols = 1);
num_timestamps = 1;
if (range == 1) {
RT = matrix (1, rows = 1, cols = 1);
} else {
Idx1[2:range,1] = (X_cur[1:(range - 1),1] != X_cur[2:range,1]);
num_timestamps = sum (Idx1);
A1 = removeEmpty (target = diag (Idx1), margin = "cols");
if (ncol (A1) > 1) {
A1[,1:(ncol (A1) - 1)] = A1[,1:(ncol (A1) - 1)] - A1[,2:ncol (A1)];
B1 = cumsum (A1);
RT = B1 %*% seq(1, ncol(B1));
} else { # there is only one group
RT = matrix (1, rows = range, cols = 1);
}
}
T = X_cur[,1];
E = X_cur[,2];
G = X_cur[,3];
S = X_cur[,4];
n_event_stratum = aggregate (target = E, groups = RT, fn = "sum"); # no. of uncensored events per stratum
n_event_all_stratum = aggregate (target = E, groups = RT, fn = "count"); # no. of both censored and uncensored events per stratum
Idx1 = cumsum (n_event_all_stratum);
time_stratum = table (seq (1, nrow (Idx1), 1), Idx1) %*% T; # distinct timestamps both censored and uncensored per stratum
time_stratum_has_zero = sum (time_stratum == 0) > 0;
if (time_stratum_has_zero) {
time_stratum = replace (target = time_stratum, pattern = 0, replacement = "Infinity");
}
n_time_all1 = nrow (n_event_stratum); # no. of distinct timestamps both censored and uncensored per stratum
n_event_all_stratum_agg = matrix (0, rows = n_time_all1, cols = 1);
if (n_time_all1 > 1) {
n_event_all_stratum_agg[2:n_time_all1,] = Idx1[1:(n_time_all1 - 1),];
}
n_risk_stratum = range - n_event_all_stratum_agg; # no. at risk per stratum
if (num_groups > 1 & test_type != "none") { # needed for log-rank or wilcoxon test
n_risk_n_event_stratum = matrix (0, rows = n_time_all1, cols = num_groups * 2);
}
parfor (g in 1:num_groups, check = 0) {
group_ind = (G == g);
KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
M_offset = (s - 1) * num_groups + g;
if (sum (group_ind) != 0) { # group g is present in the stratum s
GSI_offset = (s - 1) * num_groups + g;
GSI[GSI_offset,1] = g;
GSI[GSI_offset,2] = s;
E_cur = E * group_ind;
######## COMPUTE NO. AT RISK AND NO.OF EVENTS FOR EACH TIMESTAMP
n_event = aggregate (target = E_cur, groups = RT, fn = "sum"); # no. of uncensored events per stratum per group
n_event_all = aggregate (target = group_ind, groups = RT, fn = "sum"); # no. of both censored and uncensored events per stratum per group
Idx1 = cumsum (n_event_all);
event_occurred = (n_event > 0);
if (time_stratum_has_zero) {
time = replace (target = time_stratum * event_occurred, pattern = "NaN", replacement = 0);
time = removeEmpty (target = time, margin = "rows");
time = replace (target = time, pattern = "Infinity", replacement = 0);
} else {
time = removeEmpty (target = time_stratum * event_occurred, margin = "rows");
}
n_time_all2 = nrow (n_event); # no. of distinct timestamps both censored and uncensored per stratum per group
n_event_all_agg = matrix (0, rows = n_time_all2, cols = 1);
if (n_time_all2 > 1) {
n_event_all_agg[2:n_time_all2,] = Idx1[1:(n_time_all2 - 1),];
}
n_risk = sum (group_ind) - n_event_all_agg; # no. at risk per stratum per group
if (num_groups > 1 & test_type != "none") {
n_risk_n_event_stratum[,(g - 1) * 2 + 1] = n_risk;
n_risk_n_event_stratum[,(g - 1) * 2 + 2] = n_event;
}
# Extract only rows corresponding to events, i.e., for which n_event is nonzero
Idx1 = (n_event != 0);
KM_1 = matrix (0, rows = n_time_all2, cols = 2);
KM_1[,1] = n_risk;
KM_1[,2] = n_event;
KM_1 = removeEmpty (target = KM_1, margin = "rows", select = Idx1);
n_risk = KM_1[,1];
n_event = KM_1[,2];
n_time = nrow (time);
######## ESTIMATE SURVIVOR FUNCTION SURV, ITS STANDARD ERROR SE_SURV, AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
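# Kaplan-Meier estimate: surv(t) = prod_{t_i <= t} (1 - d_i / n_i), where d_i = no. of events and
# n_i = no. at risk at time t_i;
# Greenwood standard error: se(surv) = surv * sqrt( sum_{t_i <= t} d_i / (n_i * (n_i - d_i)) );
# Peto standard error: se(surv) = surv * sqrt(1 - surv) / sqrt(n_i)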
surv = cumprod ((n_risk - n_event) / n_risk);
tmp = n_event / (n_risk * (n_risk - n_event));
se_surv = sqrt (cumsum (tmp)) * surv;
if (err_type == "peto") {
se_surv = (surv * sqrt(1 - surv) / sqrt(n_risk));
}
if (conf_type == "plain") {
# True survivor function is in [surv +- z_alpha_2 * se_surv],
# values less than 0 are replaced by 0, values larger than 1 are replaced by 1!
CI_l = max (surv - (z_alpha_2 * se_surv), 0);
CI_r = min (surv + (z_alpha_2 * se_surv), 1);
} else if (conf_type == "log") {
# True survivor function is in [surv * exp(+- z_alpha_2 * se_surv / surv)]
CI_l = max (surv * exp (- z_alpha_2 * se_surv / surv), 0);
CI_r = min (surv * exp ( z_alpha_2 * se_surv / surv), 1);
} else { # conf_type == "log-log"
# True survivor function is in [surv ^ exp(+- z_alpha_2 * se(log(-log(surv))))]
CI_l = max (surv ^ exp (- z_alpha_2 * se_surv / log(surv)), 0);
CI_r = min (surv ^ exp ( z_alpha_2 * se_surv / log(surv)), 1);
}
#
if (as.scalar (n_risk[n_time,]) == as.scalar (n_event[n_time,])) {
CI_l[n_time,] = 0/0;
CI_r[n_time,] = 0/0;
}
n_event_sum = sum (n_event);
n_event_sum_all = sum(n_event_all);
if (n_event_sum > 0) {
# KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
KM[1:n_time,KM_offset + 1] = time;
KM[1:n_time,KM_offset + 2] = n_risk;
KM[1:n_time,KM_offset + 3] = n_event;
KM[1:n_time,KM_offset + 4] = surv;
KM[1:n_time,KM_offset + 5] = se_surv;
KM[1:n_time,KM_offset + 6] = CI_l;
KM[1:n_time,KM_offset + 7] = CI_r;
}
######## ESTIMATE MEDIAN OF SURVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
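# the median survival time is the earliest timestamp at which the estimated survivor function drops
# to 0.5 or below (averaging two adjacent timestamps if surv hits 0.5 exactly); its confidence
# limits are derived in the same way from CI_l and CI_r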
p_5 = (surv <= 0.5);
pn_5 = sum (p_5);
#M_offset = (s - 1) * num_groups + g;
# if the estimated survivor function is larger than 0.5 for all timestamps, the median does not exist!
p_5_exists = (pn_5 != 0);
M[M_offset,2] = n_event_sum;
M[M_offset,1] = n_event_sum_all;
if (p_5_exists) {
if ( as.scalar (surv[n_time - pn_5 + 1,1]) == 0.5 ) { # if the estimated survivor function is exactly equal to 0.5
if (pn_5 > 1) {
t_5 = as.scalar ((time[n_time - pn_5 + 1,1] + time[n_time - pn_5 + 2,1])/2);
} else {
t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
}
} else {
t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
}
l_ind = (CI_l <= 0.5);
r_ind = (CI_r <= 0.5);
l_ind_sum = sum (l_ind);
r_ind_sum = sum (r_ind);
l_min_ind = as.scalar (rowIndexMin (t(l_ind)));
r_min_ind = as.scalar (rowIndexMin (t(r_ind)));
if (l_min_ind == n_time) {
if (l_ind_sum > 0) {
if (as.scalar (l_ind[n_time,1]) == 0) { # NA at last position
M[M_offset,4] = time[n_time - l_ind_sum,1];
} else {
M[M_offset,4] = time[1,1];
}
}
} else {
M[M_offset,4] = time[l_min_ind + 1,1];
}
#
if (r_min_ind == n_time) {
if (r_ind_sum > 0) {
if (as.scalar (r_ind[n_time,1]) == 0) { # NA at last position
M[M_offset,5] = time[n_time - r_ind_sum,1];
} else {
M[M_offset,5] = time[1,1];
}
}
} else {
M[M_offset,5] = time[r_min_ind + 1,1];
}
M[M_offset,3] = t_5;
if (test_type != "none"){
n_event_all_global[g,s] = n_event_sum_all;
}
}
} else {
print ("group " + g + " is not present in the stratum " + s);
KM_cols_select[(KM_offset + 1):(KM_offset + 7),1] = matrix (0, rows = 7, cols = 1);
M_cols[M_offset,1] = 0;
}
}
######## COMPARISON BETWEEN DIFFERENT GROUPS USING LOG-RANK OR WILCOXON TEST
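# for each group, O is the (weighted) number of observed events and E the number expected under the
# null hypothesis of identical survival across groups; the wilcoxon variant additionally weights
# each timestamp by the fraction of the stratum still at risk. The statistic U' * inv(V) * U,
# accumulated over strata, is Chi-squared distributed with num_groups - 1 degrees of freedom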
if (num_groups > 1 & test_type != "none") {
V = matrix (0, rows = num_groups-1, cols = num_groups-1);
parfor (g in 0:(num_groups-1), check = 0) {
n_risk = n_risk_n_event_stratum[,g * 2 + 1];
n_event = n_risk_n_event_stratum[,g * 2 + 2];
if (test_type == "log-rank") {
O = n_event;
E = n_risk * n_event_stratum / n_risk_stratum;
} else { ### test_type == "wilcoxon"
O = n_risk_stratum * n_event / range;
E = n_risk * n_event_stratum / range;
}
U[(g + 1),s] = sum (O - E);
U_OE[g + 1, s] = (sum (O - E)*sum (O - E))/sum(E);
OBS[g + 1, s] = sum(O);
EXP[g + 1, s] = sum(E);
}
# parfor (i1 in 0:(num_groups - 2), check = 0) {
for (i1 in 0:(num_groups - 2), check = 0) {
n_risk = n_risk_n_event_stratum[,1 + i1 * 2];
n_event = n_risk_n_event_stratum[,2 + i1 * 2];
for (i2 in 0:(num_groups - 2)) {
n_risk_i2j = n_risk_n_event_stratum[,1 + i2 * 2];
I_i1i2 = 0;
if (i1 == i2) {
I_i1i2 = 1;
}
if (test_type == "log-rank") {
V1 = n_risk * n_event_stratum * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
V1 = replace (target = V1, pattern = "NaN", replacement = 0);
V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
V[(i1 + 1),(i2 + 1)] = sum (V1 * V2);
} else { ### test_type == "wilcoxon"
V1 = (n_risk_stratum ^ 2) * (n_risk * n_event_stratum) * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
V1 = replace (target = V1, pattern = "NaN", replacement = 0);
V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
V[(i1 + 1),(i2 + 1)] = sum (V1 * V2) / (range ^ 2);
}
}
}
V_start_ind = (s - 1) * (num_groups - 1) + 1;
V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)] = V;
}
}
if (num_groups > 1 & test_type != "none") {
V_sum = matrix (0, rows = num_groups-1, cols = num_groups-1);
for (s in 1:num_strata) {
V_start_ind = (s - 1) * (num_groups - 1) + 1;
V_sum_total_part = V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)];
V_sum = V_sum + V_sum_total_part;
}
U_sum = rowSums (U);
test_st = as.scalar (t(U_sum[1:(num_groups-1),1]) %*% inv(V_sum) %*% U_sum[1:(num_groups-1),1]);
p_val = 1 - cdf (target = test_st, dist = "chisq", df = num_groups-1 );
if (test_type != "none") {
U_OE_sum = rowSums(U_OE);
V_OE = rowSums ((U * U) / sum (V_sum));
TEST_GROUPS_OE[1,1] = num_groups;
TEST_GROUPS_OE[1,2] = num_groups - 1;
TEST_GROUPS_OE[1,3] = test_st;
TEST_GROUPS_OE[1,4] = p_val;
TEST[,1] = rowSums(n_event_all_global);
TEST[,2] = rowSums(OBS);
TEST[,3] = rowSums(EXP);
TEST[,4] = rowSums(U_OE_sum);
TEST[,5] = rowSums(V_OE);
str = append (str, test_type + " test for " + num_groups + " groups: Chi-squared = " + test_st + " on " + (num_groups - 1) + " df, p = " + p_val + " ");
}
}
GSI = removeEmpty (target = GSI, margin = "rows");
if (n_group_cols > 0) {
# making a copy of unique groups before adding new rows depending on strata
G_cols_original = G_cols;
GSI_1 = GSI[,1];
tab_G = table (seq (1, nrow (GSI_1)), GSI_1, nrow (GSI_1), nrow (G_cols));
G_cols = tab_G %*% G_cols;
}
if (n_stratum_cols > 0) {
GSI_2 = GSI[,2];
tab_S = table (seq (1, nrow (GSI_2)), GSI_2, nrow (GSI_2), nrow (S_cols));
S_cols = tab_S %*% S_cols;
}
# pull out non-empty rows from M
M_cols = removeEmpty (target = M_cols, margin = "rows");
tab_M = table (seq (1, nrow (M_cols)), M_cols, nrow (M_cols), nrow (M));
M = tab_M %*% M;
M = replace (target = M, pattern = "Infinity", replacement = "NaN");
# pull out non-empty rows from TEST
if (n_group_cols > 0 & n_stratum_cols > 0) {
M = append (append (G_cols, S_cols), M);
if (test_type != "none") {
TEST = append (G_cols_original, TEST);
}
} else if (n_group_cols > 0) {
M = append (G_cols, M);
if (test_type != "none") {
TEST = append (G_cols_original, TEST);
}
} else if (n_stratum_cols > 0) {
M = append (S_cols, M);
}
# pull out non-empty columns from KM
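# trick: zero out the deselected columns of KM and append KM_cols_select as a helper row so that
# selected (but possibly all-zero) columns survive removeEmpty; unused rows and the helper row are
# dropped afterwards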
KM = t (append (t (KM), KM_cols_select) * KM_cols_select);
KM = removeEmpty (target = KM, margin = "cols");
KM = removeEmpty (target = KM, margin = "rows");
KM = KM[1:(nrow (KM) - 1),];
# write output matrices
write (M, fileM, format=fmtO);
write (KM, fileO, format=fmtO);
if (test_type != "none") {
if (num_groups > 1 & fileT != " ") {
write (TEST, fileT, format=fmtO);
write (TEST_GROUPS_OE, fileT+".groups.oe", format=fmtO);
} else {
print (str);
}
}