scripts/algorithms/KM.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT ANALIZES SURVIVAL DATA USING KAPLAN-MEIER ESTIMATES
 #
 # INPUT   PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME    TYPE     DEFAULT      MEANING
 # ---------------------------------------------------------------------------------------------
 # X       String   ---          Location to read the input matrix X containing the survival data:
 #								timestamps, whether event occurred (1) or data is censored (0), and a number of factors (categorical features)
 #								for grouping and/or stratifying
 # TE	  String   ---          Column indices of X which contain timestamps (first entry) and event information (second entry)
 # GI	  String   ---          Column indices of X corresponding to the factors to be used for grouping
 # SI	  String   ---          Column indices of X corresponding to the factors to be used for stratifying
 # O       String   ---          Location to write the matrix containing the results of the Kaplan-Meier analysis; see below for the description
 # M       String   ---          Location to write Matrix M containing the following statistic: total number of events, median and its confidence intervals;
 #								if survival data for multiple groups and strata are provided each row of M contains the above statistics per group and stratum
 # T 	  String   " "			If survival data from multiple groups available and ttype=log-rank or wilcoxon,
 #								location to write the matrix containing result of the (stratified) test for comparing multiple groups
 # alpha   Double   0.05         Parameter to compute 100*(1-alpha)% confidence intervals for the survivor function and its median
 # etype   String   "greenwood"  Parameter to specify the error type according to "greenwood" (the default) or "peto"
 # ctype   String   "log"        Parameter to modify the confidence interval; "plain" keeps the lower and upper bound of
 #								the confidence interval unmodified,	"log" (the default) corresponds to logistic transformation and
 #								"log-log" corresponds to the complementary log-log transformation
 # ttype   String   "none"   	If survival data for multiple groups is available specifies which test to perform for comparing
 #								survival data across multiple groups: "none" (the default) "log-rank" or "wilcoxon" test
 # fmt     String   "text"       The output format of results of the Kaplan-Meier analysis, such as "text" or "csv"
 # ---------------------------------------------------------------------------------------------
 # OUTPUT:
 # 1- Matrix KM whose dimension depends on the number of groups (denoted by g) and strata (denoted by s) in the data:
 #	each collection of 7 consecutive columns in KM corresponds to a unique combination of groups and strata in the data with the following schema
 # 	1. col: timestamp
 # 	2. col: no. at risk
 # 	3. col: no. of events
 # 	4. col: Kaplan-Meier estimate of survivor function surv
 # 	5. col: standard error of surv
 # 	6. col: lower 100*(1-alpha)% confidence interval for surv
 # 	7. col: upper 100*(1-alpha)% confidence interval for surv
 # 2- Matrix M whose dimension depends on the number of groups (g) and strata (s) in the data (k denotes the number of factors used for grouping
 #	,i.e., ncol(GI) and l denotes the number of factors used for stratifying, i.e., ncol(SI))
 #	M[,1:k]: unique combination of values in the k factors used for grouping
 #	M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
 #	M[,k+l+1]: total number of records
 #	M[,k+l+2]: total number of events
 #	M[,k+l+3]: median of surv
 #	M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median of surv
 #	M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median of surv
 #	If the number of groups and strata is equal to 1, M will have 4 columns with
 #	M[,1]: total number of events
 #	M[,2]: median of surv
 #	M[,3]: lower 100*(1-alpha)% confidence interval of the median of surv
 #	M[,4]: upper 100*(1-alpha)% confidence interval of the median of surv
 # 3- If survival data from multiple groups available and ttype=log-rank or wilcoxon, a 1 x 4 matrix T and an g x 5 matrix T_GROUPS_OE with
 #	T_GROUPS_OE[,1] = no. of events
 #	T_GROUPS_OE[,2] = observed value (O)
 #	T_GROUPS_OE[,3] = expected value (E)
 #	T_GROUPS_OE[,4] = (O-E)^2/E
 #	T_GROUPS_OE[,5] = (O-E)^2/V
 #	T[1,1] = no. of groups
 #	T[1,2] = degree of freedom for Chi-squared distributed test statistic
 #	T[1,3] = test statistic
 #	T[1,4] = P-value
 # -------------------------------------------------------------------------------------------
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar SystemDS.jar -f KM.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE GI=INPUT_DIR/GI SI=INPUT_DIR/SI O=OUTPUT_DIR/O
 #											M=OUTPUT_DIR/M T=OUTPUT_DIR/T alpha=0.05 etype=greenwood ctype=log fmt=csv

 fileX = $X;
 fileTE = $TE;
 fileGI = ifdef ($GI, " ");
 fileSI = ifdef ($SI, " ");
 fileO = $O;
 fileM = $M;

 # Default values of some parameters
 fileT = ifdef ($T, " ");                  # $T=" "

 fileG = ifdef ($G, " ");                 # $G=" "
 fileS = ifdef ($S, " ");                 # $S=" "
 fmtO = ifdef ($fmt, "text");             # $fmt="text"
 alpha = ifdef ($alpha, 0.05);            # $alpha=0.05
 err_type  = ifdef ($etype, "greenwood"); # $etype="greenwood"
 conf_type = ifdef ($ctype, "log");       # $ctype="log"
 test_type = ifdef ($ttype, "none");      # $ttype="none"

 X = read (fileX);
 TE = read (fileTE);
 if (fileGI != " ") {
 	GI = read (fileGI);
 } else {
     GI = matrix (0, rows = 1, cols = 1);
 }

 if (fileSI != " "){
 	SI = read (fileSI);
 } else {
     SI = matrix (0, rows = 1, cols = 1);
 }

 TE = t(TE);
 GI = t(GI);
 SI = t(SI);

 # check arguments for validity
 if (err_type != "greenwood" & err_type != "peto") {
 	stop (err_type + " is not a valid error type!");
 }

 if (conf_type != "plain" & conf_type != "log" & conf_type != "log-log") {
 	stop (conf_type + " is not a valid confidence type!");
 }

 if (test_type != "log-rank" & test_type != "wilcoxon" & test_type != "none") {
 	stop (test_type + " is not a valid test type!");
 }

 n_group_cols = ncol (GI);
 n_stratum_cols = ncol (SI);

 # check GI and SI for validity
 GI_1_1 = as.scalar (GI[1,1]);
 SI_1_1 = as.scalar (SI[1,1]);
 if (n_group_cols == 1) {
 	if (GI_1_1 == 0) { # no factors for grouping
 		n_group_cols = 0;
 	}
 } else if (GI_1_1 == 0) {
 	stop ("Matrix GI contains zero entries!");
 }
 if (n_stratum_cols == 1) {
 	if (SI_1_1 == 0) { # no factors for stratifying
 		n_stratum_cols = 0;
 	}
 } else if (SI_1_1 == 0) {
 	stop ("Matrix SI contains zero entries!");
 }

 if (2 + n_group_cols + n_stratum_cols > ncol (X)) {
 	stop ("X has an incorrect number of columns!");
 }

 # reorder cols of X
 if (GI_1_1 == 0 & SI_1_1 == 0) {
 	Is = TE;
 } else if (GI_1_1 == 0) {
 	Is = cbind (TE, SI);
 } else if (SI_1_1 == 0) {
 	Is = cbind (TE, GI);
 } else {
 	Is = cbind (TE, GI, SI);
 }
 X = X %*% table (Is, seq (1, 2 + n_group_cols + n_stratum_cols), ncol (X), 2 + n_group_cols + n_stratum_cols);

 num_records = nrow (X);
 num_groups = 1;
 num_strata = 1;

 ### compute group id for each record
 print ("Perform grouping...");
 if (n_group_cols > 0) {
 	for (g in 1:n_group_cols) { # sort columns corresponding to groups sequentially
 		X = order (target = X, by = 2 + g);
 	}
 	XG = X[,3:(3 + n_group_cols - 1)];
 	Idx = matrix (1, rows = num_records, cols = 1);
 	Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),3:(2 + n_group_cols)] != X[2:num_records,3:(2 + n_group_cols)]);
 	num_groups = sum (Idx);

 	XG = replace (target = XG, pattern = 0, replacement = Inf);
 	XG = XG * Idx;
 	XG = replace (target = XG, pattern = NaN, replacement = 0);
 	G_cols = removeEmpty (target = XG, margin = "rows");
 	G_cols = replace (target = G_cols, pattern = Inf, replacement = 0);

 	A = removeEmpty (target = diag (Idx), margin = "cols");
 	if (ncol (A) > 1) {
 		A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
 		B = cumsum (A);
 		Gi = B %*% seq(1, ncol(B)); # group ids
 	} else { # there is only one group
 		Gi = matrix (1, rows = num_records, cols = 1);
 	}
 	if (n_stratum_cols > 0) {
 		X = cbind (X[,1:2], Gi, X[,(3 + g):ncol (X)]);
 	} else { # no strata
 		X = cbind (X[,1:2],Gi);
 	}
 }

 ### compute stratum id for each record
 print ("Perform stratifying...");
 if (n_stratum_cols > 0) {
 	s_offset = 2;
 	if (n_group_cols > 0) {
 		s_offset = 3;
 	}
 	for (s in 1:n_stratum_cols) { # sort columns corresponding to strata sequentially
 		X = order (target = X, by = s_offset + s);
 	}
 	XS = X[,(s_offset + 1):(s_offset + n_stratum_cols)];
 	Idx = matrix (1, rows = num_records, cols = 1);
 	Idx[2:num_records,] = rowMaxs (X[1:(num_records - 1),(s_offset + 1):(s_offset + n_stratum_cols)] != X[2:num_records,(s_offset + 1):(s_offset + n_stratum_cols)]);
 	num_strata = sum (Idx);

 	XS = replace (target = XS, pattern = 0, replacement = "Infinity");
 	XS = XS * Idx;
 	XS = replace (target = XS, pattern = NaN, replacement = 0);
 	S_cols = removeEmpty (target = XS, margin = "rows");
 	S_cols = replace (target = S_cols, pattern = Inf, replacement = 0);

 	SB = removeEmpty (target = seq (1,num_records), margin = "rows", select = Idx); # indices of stratum boundaries
 	A = removeEmpty (target = diag (Idx), margin = "cols");
 	if (ncol (A) > 1) {
 		A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
 		B = cumsum (A);
 		Si = B %*% seq(1, ncol(B)); # stratum ids
 	} else { # there is only one stratum
 		Si = matrix (1, rows = num_records, cols = 1);
 	}
 	X = cbind (X[,1:3],Si);
 }

 if (n_group_cols == 0 & n_stratum_cols == 0) {
 	X = cbind (X, matrix (1, rows = num_records, cols = 2));
 	SB = matrix (1, rows = 1, cols = 1);
 } else if (n_group_cols == 0) {
 	X = cbind (X[,1:2], matrix (1, rows = num_records, cols = 1), X[,3]);
 } else if (n_stratum_cols == 0) {
 	X = cbind (X, matrix (1, rows = num_records, cols = 1));
 	SB = matrix (1, rows = 1, cols = 1);
 }

 ######## BEGIN KAPLAN-MEIER ANALYSIS
 print ("BEGIN KAPLAN-MEIER SURVIVAL FIT SCRIPT");

 KM = matrix (0, rows = num_records, cols = num_groups * num_strata * 7);
 KM_cols_select = matrix (1, rows = num_groups * num_strata * 7, cols = 1);
 GSI = matrix (0, rows = num_groups * num_strata, cols = 2);
 a = 1/0;
 M = matrix (a, rows = num_groups * num_strata, cols = 5);
 M_cols = seq (1, num_groups * num_strata);
 z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");

 if (num_groups > 1 & test_type != "none") {
 	str = "";
 	TEST = matrix (0, rows = num_groups, cols = 5);
 	TEST_GROUPS_OE = matrix (0, rows = 1, cols = 4);
 	U = matrix (0, rows = num_groups, cols = num_strata);
 	U_OE = matrix (0, rows = num_groups, cols = num_strata);
 	OBS = matrix (0, rows = num_groups, cols = num_strata);
 	EXP = matrix (0, rows = num_groups, cols = num_strata);
 	V_sum_total = matrix (0, rows = num_groups-1, cols = (num_groups-1) * num_strata);
 	n_event_all_global = matrix(1, rows=num_groups, cols=num_strata);
 } else if (num_groups == 1 & test_type != "none") {
 	stop ("Data contains only one group or no groups, at least two groups are required for test!");
 }

 parfor (s in 1:num_strata, check = 0) {

 	start_ind = as.scalar (SB[s,]);
 	end_ind = num_records;
 	if (s != num_strata) {
 		end_ind = as.scalar (SB[s + 1,]) - 1;
 	}

 	######## RECODING TIMESTAMPS PRESERVING THE ORDER

 	X_cur = X[start_ind:end_ind,];
 	range = end_ind - start_ind + 1;
 	X_cur = order (target = X_cur, by = 1);
 	Idx1 = matrix (1, rows = range, cols = 1);

 	num_timestamps = 1;
 	if (range == 1) {
 		RT = matrix (1, rows = 1, cols = 1);
 	} else {
 		Idx1[2:range,1] = (X_cur[1:(range - 1),1] != X_cur[2:range,1]);
 		num_timestamps = sum (Idx1);
 		A1 = removeEmpty (target = diag (Idx1), margin = "cols");
 		if (ncol (A1) > 1) {
 			A1[,1:(ncol (A1) - 1)] = A1[,1:(ncol (A1) - 1)] - A1[,2:ncol (A1)];
 			B1 = cumsum (A1);
 			RT = B1 %*% seq(1, ncol(B1));
 		} else { # there is only one group
 			RT = matrix (1, rows = range, cols = 1);
 		}
 	}

 	T = X_cur[,1];
 	E = X_cur[,2];
 	G = X_cur[,3];
 	S = X_cur[,4];

 	n_event_stratum = aggregate (target = E, groups = RT, fn = "sum"); # no. of uncensored events per stratum
 	n_event_all_stratum = aggregate (target = E, groups = RT, fn = "count"); # no. both censored and uncensored of events per stratum
 	Idx1 = cumsum (n_event_all_stratum);
 	time_stratum = table (seq (1, nrow (Idx1), 1), Idx1) %*% T; # distinct timestamps both censored and uncensored per stratum
 	time_stratum_has_zero = sum (time_stratum == 0) > 0;
 	if (time_stratum_has_zero) {
 		time_stratum = 	replace (target = time_stratum, pattern = 0, replacement = Inf);
 	}
 	n_time_all1 = nrow (n_event_stratum);  # no. of distinct timestamps both censored and uncensored per stratum
 	n_event_all_stratum_agg = matrix (0, rows = n_time_all1, cols = 1);
 	if (n_time_all1 > 1) {
 		n_event_all_stratum_agg[2:n_time_all1,] = Idx1[1:(n_time_all1 - 1),];
 	}
 	n_risk_stratum = range - n_event_all_stratum_agg; # no. at risk per stratum

 	if (num_groups > 1 & test_type != "none") {	# needed for log-rank or wilcoxon test
 		n_risk_n_event_stratum = matrix (0, rows = n_time_all1, cols = num_groups * 2);
 	}

 	parfor (g in 1:num_groups, check = 0) {

 		group_ind = (G == g);
 		KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
 		M_offset = (s - 1) * num_groups + g;
 		if (sum (group_ind) != 0) { # group g is present in the stratum s

 			GSI_offset = (s - 1) * num_groups + g;
 			GSI[GSI_offset,1] = g;
 			GSI[GSI_offset,2] = s;
 			E_cur = E * group_ind;

 			######## COMPUTE NO. AT RISK AND NO.OF EVENTS FOR EACH TIMESTAMP

 			n_event = aggregate (target = E_cur, groups = RT, fn = "sum"); # no. of uncensored events per stratum per group
 			n_event_all = aggregate (target = group_ind, groups = RT, fn = "sum"); # no. of both censored and uncensored events per stratum per group
 			Idx1 = cumsum (n_event_all);
 			event_occurred = (n_event > 0);
 			if (time_stratum_has_zero) {
 				time = replace (target = time_stratum * event_occurred, pattern = NaN, replacement = 0);
 				time = removeEmpty (target = time, margin = "rows");
 				time = replace (target = time, pattern = Inf, replacement = 0);
 			} else {
 				time = removeEmpty (target = time_stratum * event_occurred, margin = "rows");
 			}
 			n_time_all2 = nrow (n_event);  # no. of distinct timestamps both censored and uncensored per stratum per group
 			n_event_all_agg = matrix (0, rows = n_time_all2, cols = 1);
 			if (n_time_all2 > 1) {
 				n_event_all_agg[2:n_time_all2,] = Idx1[1:(n_time_all2 - 1),];
 			}

 			n_risk = sum (group_ind) - n_event_all_agg; # no. at risk per stratum per group

 			if (num_groups > 1 & test_type != "none") {
 				n_risk_n_event_stratum[,(g - 1) * 2 + 1] = n_risk;
 				n_risk_n_event_stratum[,(g - 1) * 2 + 2] = n_event;
 			}

 			# Extract only rows corresponding to events, i.e., for which n_event is nonzero
 			Idx1 = (n_event != 0);
 			KM_1 = matrix (0, rows = n_time_all2, cols = 2);
 			KM_1[,1] = n_risk;
 			KM_1[,2] = n_event;
 			KM_1 = removeEmpty (target = KM_1, margin = "rows", select = Idx1);
 			n_risk = KM_1[,1];
 			n_event = KM_1[,2];
 			n_time = nrow (time);

 			######## ESTIMATE SERVIVOR FUNCTION SURV, ITS STANDARD ERROR SE_SURV, AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
 			surv = cumprod ((n_risk - n_event) / n_risk);
 			tmp = n_event / (n_risk * (n_risk - n_event));
 			se_surv = sqrt (cumsum (tmp)) * surv;
 			if (err_type == "peto") {
 				se_surv = (surv * sqrt(1 - surv) / sqrt(n_risk));
 			}

 			if (conf_type == "plain") {
 				# True survivor function is in [surv +- z_alpha_2 * se_surv],
 				# values less than 0 are replaced by 0, values larger than 1are replaced by 1!
 				CI_l = max (surv - (z_alpha_2 * se_surv), 0);
 				CI_r = min (surv + (z_alpha_2 * se_surv), 1);
 			} else if (conf_type == "log") {
 				# True survivor function is in [surv * exp(+- z_alpha_2 * se_surv / surv)]
 				CI_l = max (surv * exp (- z_alpha_2 * se_surv / surv), 0);
 				CI_r = min (surv * exp ( z_alpha_2 * se_surv / surv), 1);
 			} else { # conf_type == "log-log"
 				# True survivor function is in [surv ^ exp(+- z_alpha_2 * se(log(-log(surv))))]
 				CI_l = max (surv ^ exp (- z_alpha_2 * se_surv / log(surv)), 0);
 				CI_r = min (surv ^ exp ( z_alpha_2 * se_surv / log(surv)), 1);
 			}
 			#
 			if (as.scalar (n_risk[n_time,]) == as.scalar (n_event[n_time,])) {
 				CI_l[n_time,] = 0/0;
 				CI_r[n_time,] = 0/0;
 			}

 			n_event_sum = sum (n_event);
 			n_event_sum_all = sum(n_event_all);
 			if (n_event_sum > 0) {
 				# KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
 				KM[1:n_time,KM_offset + 1] = time;
 				KM[1:n_time,KM_offset + 2] = n_risk;
 				KM[1:n_time,KM_offset + 3] = n_event;
 				KM[1:n_time,KM_offset + 4] = surv;
 				KM[1:n_time,KM_offset + 5] = se_surv;
 				KM[1:n_time,KM_offset + 6] = CI_l;
 				KM[1:n_time,KM_offset + 7] = CI_r;
 			}

 			######## ESTIMATE MEDIAN OF SERVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL

 			p_5 = (surv <= 0.5);
 			pn_5 = sum (p_5);
 			#M_offset = (s - 1) * num_groups + g;
 			# if the estimated survivor function is larger than 0.5 for all timestamps median does not exist!
 			p_5_exists = (pn_5 != 0);
 			M[M_offset,2] = n_event_sum;
 			M[M_offset,1] = n_event_sum_all;
 			if (p_5_exists) {
 				if ( as.scalar (surv[n_time - pn_5 + 1,1]) == 0.5 ) { # if the estimated survivor function is exactly equal to 0.5
 					if (pn_5 > 1) {
 						t_5 = as.scalar ((time[n_time - pn_5 + 1,1] + time[n_time - pn_5 + 2,1])/2);
 					} else {
 						t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
 					}
 				} else {
 					t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
 				}

 				l_ind = (CI_l <= 0.5);
 				r_ind = (CI_r <= 0.5);
 				l_ind_sum = sum (l_ind);
 				r_ind_sum = sum (r_ind);
 				l_min_ind = as.scalar (rowIndexMin (t(l_ind)));
 				r_min_ind = as.scalar (rowIndexMin (t(r_ind)));
 				if (l_min_ind == n_time) {
 					if (l_ind_sum > 0) {
 						if (as.scalar (l_ind[n_time,1]) == 0) { # NA at last position
 							M[M_offset,4] = time[n_time - l_ind_sum,1];
 						} else {
 							M[M_offset,4] = time[1,1];
 						}
 					}
 				} else {
 					M[M_offset,4] = time[l_min_ind + 1,1];
 				}
 				#
 				if (r_min_ind == n_time) {
 					if (r_ind_sum > 0) {
 						if (as.scalar (r_ind[n_time,1]) == 0) { # NA at last position
 							M[M_offset,5] = time[n_time - r_ind_sum,1];
 						} else {
 							M[M_offset,5] = time[1,1];
 						}
 					}
 				} else {
 					M[M_offset,5] = time[r_min_ind + 1,1];
 				}
 				M[M_offset,3] = t_5;
 				if (test_type != "none"){
 					n_event_all_global[g,s] = n_event_sum_all;
 				}
 			}
 		} else {
 			print ("group " + g + " is not present in the stratum " + s);
 			KM_cols_select[(KM_offset + 1):(KM_offset + 7),1] = matrix (0, rows = 7, cols = 1);
 			M_cols[M_offset,1] = 0;
 		}
 	}


 	######## COMPARISON BETWEEN DIFFERENT GROUPS USING LOG-RANK OR WILCOXON TEST

 	if (num_groups > 1 & test_type != "none") {

 		V = matrix (0, rows = num_groups-1, cols = num_groups-1);
 		parfor (g in 0:(num_groups-1), check = 0) {

 			n_risk = n_risk_n_event_stratum[,g * 2 + 1];
 			n_event = n_risk_n_event_stratum[,g * 2 + 2];

 			if (test_type == "log-rank") {
 				O = n_event;
 				E = n_risk * n_event_stratum / n_risk_stratum;
 			} else { ### test_type == "wilcoxon"
 				O = n_risk_stratum * n_event / range;
 				E = n_risk * n_event_stratum / range;
 			}
 			U[(g + 1),s] = sum (O - E);
 			U_OE[g + 1, s] = (sum (O - E)*sum (O - E))/sum(E);
 			OBS[g + 1, s] = sum(O);
 			EXP[g + 1, s] = sum(E);
 		}

 		# parfor (i1 in 0:(num_groups - 2), check = 0) {
 		for (i1 in 0:(num_groups - 2), check = 0) {

 			n_risk = n_risk_n_event_stratum[,1 + i1 * 2];
 			n_event = n_risk_n_event_stratum[,2 + i1 * 2];
 			for (i2 in 0:(num_groups - 2)) {

 				n_risk_i2j = n_risk_n_event_stratum[,1 + i2 * 2];
 				I_i1i2 = 0;
 				if (i1 == i2) {
 					I_i1i2 = 1;
 				}
 				if (test_type == "log-rank") {
 					V1 = n_risk * n_event_stratum * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
 					V1 = replace (target = V1, pattern = NaN, replacement = 0);
 					V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
 					V[(i1 + 1),(i2 + 1)] = sum (V1 * V2);
 				} else { ### test_type == "wilcoxon"
 					V1 = (n_risk_stratum ^ 2) * (n_risk * n_event_stratum) * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
 					V1 = replace (target = V1, pattern = NaN, replacement = 0);
 					V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
 					V[(i1 + 1),(i2 + 1)] = sum (V1 * V2) / (range ^ 2);
 				}
 			}
 		}
 		V_start_ind = (s - 1) * (num_groups - 1) + 1;
 		V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)] = V;
 	}
 }

 if (num_groups > 1 & test_type != "none") {
 	V_sum = matrix (0, rows = num_groups-1, cols = num_groups-1);
 	for (s in 1:num_strata) {
 		V_start_ind = (s - 1) * (num_groups - 1) + 1;
 	    V_sum_total_part = V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)];
 		V_sum = V_sum + V_sum_total_part;
 	}

 	U_sum = rowSums (U);

 	test_st = as.scalar (t(U_sum[1:(num_groups-1),1]) %*% inv(V_sum) %*% U_sum[1:(num_groups-1),1]);
 	p_val = 1 - cdf (target = test_st, dist = "chisq", df = num_groups-1 );
 	if (test_type != "none") {
 		U_OE_sum = rowSums(U_OE);
 		V_OE =rowSums((U*U) /sum(V_sum));
 		TEST_GROUPS_OE[1,1] = num_groups;
 		TEST_GROUPS_OE[1,2] = num_groups - 1;
 		TEST_GROUPS_OE[1,3] = test_st;
 		TEST_GROUPS_OE[1,4] = p_val;
 		TEST[,1] = rowSums(n_event_all_global);
 		TEST[,2] = rowSums(OBS);
 		TEST[,3] = rowSums(EXP);
 		TEST[,4] = rowSums(U_OE_sum);
 		TEST[,5] = rowSums(V_OE);
 		str = append (str, test_type + " test for " + num_groups + " groups: Chi-squared = " + test_st + " on " + (num_groups - 1) + " df, p = " + p_val + " ");
 	}
 }

 GSI = removeEmpty (target = GSI, margin = "rows");
 if (n_group_cols > 0) {
 	# making a copy of unique groups before adding new rows depending on strata
 	G_cols_original = G_cols;

 	GSI_1 = GSI[,1];
 	tab_G = table (seq (1, nrow (GSI_1)), GSI_1, nrow (GSI_1), nrow (G_cols));
 	G_cols = tab_G %*% G_cols;
 }

 if (n_stratum_cols > 0) {
 	GSI_2 = GSI[,2];
 	tab_S = table (seq (1, nrow (GSI_2)), GSI_2, nrow (GSI_2), nrow (S_cols));
 	S_cols = tab_S %*% S_cols;
 }

 # pull out non-empty rows from M
 M_cols = removeEmpty (target = M_cols, margin = "rows");
 tab_M = table (seq (1, nrow (M_cols)), M_cols, nrow (M_cols), nrow (M));
 M = tab_M %*% M;
 M = replace (target = M, pattern = Inf, replacement = NaN);

 # pull out non-empty rows from TEST
 if (n_group_cols > 0 & n_stratum_cols > 0) {
 	M = cbind (G_cols, S_cols, M);
 	if (test_type != "none") {
 		TEST = cbind (G_cols_original, TEST);
 	}
 } else if (n_group_cols > 0) {
 	M = cbind (G_cols, M);
 	if (test_type != "none") {
 		TEST = cbind (G_cols_original, TEST);
 	}
 } else if (n_stratum_cols > 0) {
 	M = cbind (S_cols, M);
 }

 # pull out non-empty columns from KM
 KM = t (cbind (t (KM), KM_cols_select) * KM_cols_select);
 KM = removeEmpty (target = KM, margin = "cols");
 KM = removeEmpty (target = KM, margin = "rows");
 KM = KM[1:(nrow (KM) - 1),];

 # write output matrices
 write (M, fileM, format=fmtO);
 write (KM, fileO, format=fmtO);

 if (test_type != "none") {
 	if (num_groups > 1 & fileT != " ") {
 		write (TEST, fileT, format=fmtO);
 		write (TEST_GROUPS_OE, fileT+".groups.oe", format=fmtO);
 	} else {
 		print (str);
 	}
 }