src/test/scripts/functions/jmlc/reuse-glm-predict.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Default values for input parameters:
 fileX = $X;
 fileB = $B;
 fileM = ifdef ($M, " ");
 fileY = ifdef ($Y, " ");
 fileO = ifdef ($O, " ");
 fmtM  = ifdef ($fmt, "text");

 dist_type  = ifdef ($dfam, 1);    # $dfam = 1;
 var_power  = ifdef ($vpow, 0.0);  # $vpow = 0.0;
 link_type  = ifdef ($link, 0);    # $link = 0;
 link_power = ifdef ($lpow, 1.0);  # $lpow = 1.0;
 dispersion = ifdef ($disp, 1.0);  # $disp = 1.0;

 var_power  = as.double (var_power);
 link_power = as.double (link_power);
 dispersion = as.double (dispersion);

 if (dist_type == 3) {
     link_type = 2;
 } else { if (link_type == 0) { # Canonical Link
     if (dist_type == 1) {
         link_type = 1;
         link_power = 1.0 - var_power;
     } else { if (dist_type == 2) {
             link_type = 2;
 }}} }

 X = read(fileX);
 #X = table(X[,1], X[,2], X[,3])

 num_records  = nrow (X);
 num_features = ncol (X);

 W = read (fileB);
 if (dist_type == 3) {
     beta =  W [1 : ncol (X),  ];
     intercept = W [nrow(W),  ];
 } else {
     beta =  W [1 : ncol (X), 1];
     intercept = W [nrow(W), 1];
 }
 if (nrow (W) == ncol (X)) {
     intercept = 0.0 * intercept;
     is_intercept = FALSE;
 } else {
     num_features = num_features + 1;
     is_intercept = TRUE;
 }

 ones_rec = matrix (1, rows = num_records, cols = 1);
 linear_terms = X %*% beta + ones_rec %*% intercept;
 [means, vars] =
     glm_means_and_vars (linear_terms, dist_type, var_power, link_type, link_power);

 if (fileM != " ") {
     write (means, fileM, format=fmtM);
 }

 predicted_y = rowIndexMax(means)
 write(predicted_y, $P, format=fmtM)

 if (fileY != " ")
 {
     Y = read (fileY);
     ones_ctg = matrix (1, rows = ncol(Y), cols = 1);

     # Statistics To Compute:

     Z_logl               = NaN;
     Z_logl_pValue        = NaN;
     X2_pearson           = NaN;
     df_pearson           = -1;
     G2_deviance          = NaN;
     df_deviance          = -1;
     X2_pearson_pValue    = NaN;
     G2_deviance_pValue   = NaN;
     Z_logl_scaled        = NaN;
     Z_logl_scaled_pValue = NaN;
     X2_scaled            = NaN;
     X2_scaled_pValue     = NaN;
     G2_scaled            = NaN;
     G2_scaled_pValue     = NaN;

     # set Y_counts to avoid 'Initialization of Y_counts depends on if-else execution' warning
     Y_counts = matrix(0.0, rows=1, cols=1);

     if (dist_type == 1 & link_type == 1) {
     #
     # POWER DISTRIBUTIONS (GAUSSIAN, POISSON, GAMMA, ETC.)
     #
         if (link_power == 0.0) {
             is_zero_Y = (Y == 0.0);
             lt_saturated = log (Y + is_zero_Y) - is_zero_Y / (1.0 - is_zero_Y);
         } else {
             lt_saturated = Y ^ link_power;
         }
         Y_counts = ones_rec;

         X2_pearson = sum ((Y - means) ^ 2 / vars);
         df_pearson = num_records - num_features;

         log_l_part =
             glm_partial_loglikelihood_for_power_dist_and_link (linear_terms, Y, var_power, link_power);
         log_l_part_saturated =
             glm_partial_loglikelihood_for_power_dist_and_link (lt_saturated, Y, var_power, link_power);

         G2_deviance = 2 * sum (log_l_part_saturated) - 2 * sum (log_l_part);
         df_deviance = num_records - num_features;

     } else { if (dist_type >= 2) {
     #
     # BINOMIAL AND MULTINOMIAL DISTRIBUTIONS
     #
         if (ncol (Y) == 1) {
             num_categories = ncol (beta) + 1;
             if (min (Y) <= 0) {
                 # Category labels "0", "-1" etc. are converted into the baseline label
                 Y = Y + (- Y + num_categories) * (Y <= 0);
             }
             Y_size = min (num_categories, max(Y));
             Y_unsized = table (seq (1, num_records, 1), Y);
             Y = matrix (0, rows = num_records, cols = num_categories);
             Y [, 1 : Y_size] = Y_unsized [, 1 : Y_size];
             Y_counts = ones_rec;
         } else {
             Y_counts = rowSums (Y);
         }

         P = means;
         zero_Y = (Y == 0.0);
         zero_P = (P == 0.0);
         ones_ctg = matrix (1, rows = ncol(Y), cols = 1);

         logl_vec = rowSums (Y *  log (P + zero_Y)   );
         ent1_vec = rowSums (P *  log (P + zero_P)   );
         ent2_vec = rowSums (P * (log (P + zero_P))^2);
         E_logl   = sum (Y_counts * ent1_vec);
         V_logl   = sum (Y_counts * (ent2_vec - ent1_vec ^ 2));
         Z_logl   = (sum (logl_vec) - E_logl) / sqrt (V_logl);

         means = means * (Y_counts %*% t(ones_ctg));
         vars  = vars  * (Y_counts %*% t(ones_ctg));

         frac_below_5 = sum (means < 5) / (nrow (means) * ncol (means));
         frac_below_1 = sum (means < 1) / (nrow (means) * ncol (means));

         if (frac_below_5 > 0.2 | frac_below_1 > 0.0) {
             print ("WARNING: residual statistics are inaccurate here due to low cell means.");
         }

         X2_pearson = sum ((Y - means) ^ 2 / means);
         df_pearson = (num_records - num_features) * (ncol(Y) - 1);

         G2_deviance = 2 * sum (Y * log ((Y + zero_Y) / (means + zero_Y)));
         df_deviance = (num_records - num_features) * (ncol(Y) - 1);
     }}

     if (Z_logl == Z_logl) {
         Z_logl_absneg = - abs (Z_logl);
         Z_logl_pValue = 2.0 * pnorm(target = Z_logl_absneg);
     }
     if (X2_pearson == X2_pearson & df_pearson > 0) {
         X2_pearson_pValue = pchisq(target = X2_pearson, df = df_pearson, lower.tail=FALSE);
     }
     if (G2_deviance == G2_deviance & df_deviance > 0) {
         G2_deviance_pValue = pchisq(target = G2_deviance, df = df_deviance, lower.tail=FALSE);
     }

     Z_logl_scaled = Z_logl / sqrt (dispersion);
     X2_scaled = X2_pearson / dispersion;
     G2_scaled = G2_deviance / dispersion;

     if (Z_logl_scaled == Z_logl_scaled) {
         Z_logl_scaled_absneg = - abs (Z_logl_scaled);
         Z_logl_scaled_pValue = 2.0 * pnorm(target = Z_logl_scaled_absneg);
     }
     if (X2_scaled == X2_scaled & df_pearson > 0) {
         X2_scaled_pValue = pchisq(target = X2_scaled, df = df_pearson, lower.tail=FALSE);
     }
     if (G2_scaled == G2_scaled & df_deviance > 0) {
         G2_scaled_pValue = pchisq(target = G2_scaled, df = df_deviance, lower.tail=FALSE);
     }

     avg_tot_Y = colSums (    Y    ) / sum (Y_counts);
     avg_res_Y = colSums (Y - means) / sum (Y_counts);

     ss_avg_tot_Y = colSums ((    Y     - Y_counts %*% avg_tot_Y) ^ 2);
     ss_res_Y     = colSums ((Y - means) ^ 2);
     ss_avg_res_Y = colSums ((Y - means - Y_counts %*% avg_res_Y) ^ 2);

     df_ss_res_Y  = sum (Y_counts) - num_features;
     if (is_intercept) {
         df_ss_avg_res_Y = df_ss_res_Y;
     } else {
         df_ss_avg_res_Y = df_ss_res_Y - 1;
     }

     var_tot_Y = ss_avg_tot_Y / (sum (Y_counts) - 1);
     if (df_ss_avg_res_Y > 0) {
         var_res_Y = ss_avg_res_Y / df_ss_avg_res_Y;
     } else {
         var_res_Y = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
     }
     plain_R2_nobias  = 1 - ss_avg_res_Y / ss_avg_tot_Y;
     adjust_R2_nobias = 1 - var_res_Y / var_tot_Y;
     plain_R2  = 1 - ss_res_Y / ss_avg_tot_Y;
     if (df_ss_res_Y > 0) {
         adjust_R2 = 1 - (ss_res_Y / df_ss_res_Y) / var_tot_Y;
     } else {
         adjust_R2 = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
     }

     predicted_avg_var_res_Y = dispersion * colSums (vars) / sum (Y_counts);

     # PREPARING THE OUTPUT CSV STATISTICS FILE

     str = "LOGLHOOD_Z,,FALSE," + Z_logl;
     str = append (str, "LOGLHOOD_Z_PVAL,,FALSE," + Z_logl_pValue);
     str = append (str, "PEARSON_X2,,FALSE," + X2_pearson);
     str = append (str, "PEARSON_X2_BY_DF,,FALSE," + (X2_pearson / df_pearson));
     str = append (str, "PEARSON_X2_PVAL,,FALSE," + X2_pearson_pValue);
     str = append (str, "DEVIANCE_G2,,FALSE," + G2_deviance);
     str = append (str, "DEVIANCE_G2_BY_DF,,FALSE," + (G2_deviance / df_deviance));
     str = append (str, "DEVIANCE_G2_PVAL,,FALSE," + G2_deviance_pValue);
     str = append (str, "LOGLHOOD_Z,,TRUE," + Z_logl_scaled);
     str = append (str, "LOGLHOOD_Z_PVAL,,TRUE," + Z_logl_scaled_pValue);
     str = append (str, "PEARSON_X2,,TRUE," + X2_scaled);
     str = append (str, "PEARSON_X2_BY_DF,,TRUE," + (X2_scaled / df_pearson));
     str = append (str, "PEARSON_X2_PVAL,,TRUE," + X2_scaled_pValue);
     str = append (str, "DEVIANCE_G2,,TRUE," + G2_scaled);
     str = append (str, "DEVIANCE_G2_BY_DF,,TRUE," + (G2_scaled / df_deviance));
     str = append (str, "DEVIANCE_G2_PVAL,,TRUE," + G2_scaled_pValue);

     for (i in 1:ncol(Y)) {
         str = append (str, "AVG_TOT_Y," + i + ",," + as.scalar (avg_tot_Y [1, i]));
         str = append (str, "STDEV_TOT_Y," + i + ",," + as.scalar (sqrt (var_tot_Y [1, i])));
         str = append (str, "AVG_RES_Y," + i + ",," + as.scalar (avg_res_Y [1, i]));
         str = append (str, "STDEV_RES_Y," + i + ",," + as.scalar (sqrt (var_res_Y [1, i])));
         str = append (str, "PRED_STDEV_RES," + i + ",TRUE," + as.scalar (sqrt (predicted_avg_var_res_Y [1, i])));
         str = append (str, "PLAIN_R2," + i + ",," + as.scalar (plain_R2 [1, i]));
         str = append (str, "ADJUSTED_R2," + i + ",," + as.scalar (adjust_R2 [1, i]));
         str = append (str, "PLAIN_R2_NOBIAS," + i + ",," + as.scalar (plain_R2_nobias [1, i]));
         str = append (str, "ADJUSTED_R2_NOBIAS," + i + ",," + as.scalar (adjust_R2_nobias [1, i]));
     }

     if (fileO != " ") {
         write (str, fileO);
     } else {
         print (str);
     }
 }

 glm_means_and_vars =
     function (Matrix[double] linear_terms, int dist_type, double var_power, int link_type, double link_power)
     return (Matrix[double] means, Matrix[double] vars)
     # NOTE: "vars" represents the variance without dispersion, i.e. the V(mu) function.
 {
     num_points = nrow (linear_terms);
     if (dist_type == 1 & link_type == 1) {
     # POWER DISTRIBUTION
         if          (link_power ==  0.0) {
             y_mean = exp (linear_terms);
         } else { if (link_power ==  1.0) {
             y_mean = linear_terms;
         } else { if (link_power == -1.0) {
             y_mean = 1.0 / linear_terms;
         } else {
             y_mean = linear_terms ^ (1.0 / link_power);
         }}}
         if (var_power == 0.0) {
             var_function = matrix (1.0, rows = num_points, cols = 1);
         } else { if (var_power == 1.0) {
             var_function = y_mean;
         } else {
             var_function = y_mean ^ var_power;
         }}
         means = y_mean;
         vars = var_function;
     } else { if (dist_type == 2 & link_type >= 1 & link_type <= 5) {
     # BINOMIAL/BERNOULLI DISTRIBUTION
         y_prob = matrix (0.0, rows = num_points, cols = 2);
         if          (link_type == 1 & link_power == 0.0)  { # Binomial.log
             y_prob [, 1]  = exp (linear_terms);
             y_prob [, 2]  = 1.0 - y_prob [, 1];
         } else { if (link_type == 1 & link_power != 0.0)  { # Binomial.power_nonlog
             y_prob [, 1]  = linear_terms ^ (1.0 / link_power);
             y_prob [, 2]  = 1.0 - y_prob [, 1];
         } else { if (link_type == 2)                      { # Binomial.logit
             elt = exp (linear_terms);
             y_prob [, 1]  = elt / (1.0 + elt);
             y_prob [, 2]  = 1.0 / (1.0 + elt);
         } else { if (link_type == 3)                      { # Binomial.probit
             sign_lt = 2 * (linear_terms >= 0.0) - 1;
             t_gp = 1.0 / (1.0 + abs (linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
             erf_corr =
                 t_gp * ( 0.254829592
               + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
               + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
               + t_gp * (-1.453152027
               + t_gp *   1.061405429)))) * sign_lt * exp (- (linear_terms ^ 2) / 2.0);
             y_prob [, 1] = (1 + sign_lt) - erf_corr;
             y_prob [, 2] = (1 - sign_lt) + erf_corr;
             y_prob = y_prob / 2;
         } else { if (link_type == 4)                      { # Binomial.cloglog
             elt = exp (linear_terms);
             is_too_small = ((10000000 + elt) == 10000000);
             y_prob [, 2] = exp (- elt);
             y_prob [, 1] = (1 - is_too_small) * (1.0 - y_prob [, 2]) + is_too_small * elt * (1.0 - elt / 2);
         } else { if (link_type == 5)                      { # Binomial.cauchit
             atan_linear_terms = atan (linear_terms);
             y_prob [, 1] = 0.5 + atan_linear_terms / pi;
             y_prob [, 2] = 0.5 - atan_linear_terms / pi;
         }}}}}}
         means = y_prob;
         ones_ctg = matrix (1, rows = 2, cols = 1);
         vars  = means * (means %*% (1 - diag (ones_ctg)));
     } else { if (dist_type == 3) {
     # MULTINOMIAL LOGIT DISTRIBUTION
         elt = exp (linear_terms);
         ones_pts = matrix (1, rows = num_points, cols = 1);
         elt = cbind (elt, ones_pts);
         ones_ctg = matrix (1, rows = ncol (elt), cols = 1);
         means = elt / (rowSums (elt) %*% t(ones_ctg));
         vars  = means * (means %*% (1 - diag (ones_ctg)));
     } else {
         means = matrix (0.0, rows = num_points, cols = 1);
         vars  = matrix (0.0, rows = num_points, cols = 1);
 }   }}}

 glm_partial_loglikelihood_for_power_dist_and_link =   # Assumes: dist_type == 1 & link_type == 1
     function (Matrix[double] linear_terms, Matrix[double] Y, double var_power, double link_power)
     return (Matrix[double] log_l_part)
 {
     num_records = nrow (Y);
     if (var_power == 1.0) { # Poisson
         if (link_power == 0.0)  { # Poisson.log
             is_natural_parameter_log_zero = (linear_terms == -Inf);
             natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
             b_cumulant = exp (linear_terms);
         } else {                  # Poisson.power_nonlog
             is_natural_parameter_log_zero = (linear_terms == 0.0);
             natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
             b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
         }
         is_minus_infinity = (Y > 0) * is_natural_parameter_log_zero;
         log_l_part = Y * natural_parameters - b_cumulant - is_minus_infinity / (1 - is_minus_infinity);
     } else {
         if (var_power == 2.0 & link_power == 0.0)  { # Gamma.log
             natural_parameters = - exp (- linear_terms);
             b_cumulant = linear_terms;
         } else { if (var_power == 2.0)  { # Gamma.power_nonlog
             natural_parameters = - linear_terms ^ (- 1.0 / link_power);
             b_cumulant = log (linear_terms) / link_power;
         } else { if (link_power == 0.0) { # PowerDist.log
             natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
             b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
         } else {                          # PowerDist.power_nonlog
             power_np = (1.0 - var_power) / link_power;
             natural_parameters = (linear_terms ^ power_np) / (1.0 - var_power);
             power_cu = (2.0 - var_power) / link_power;
             b_cumulant = (linear_terms ^ power_cu) / (2.0 - var_power);
         }}}
         log_l_part = Y * natural_parameters - b_cumulant;
 }   }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Default values for input parameters:
	fileX = $X;
	fileB = $B;
	fileM = ifdef ($M, " ");
	fileY = ifdef ($Y, " ");
	fileO = ifdef ($O, " ");
	fmtM = ifdef ($fmt, "text");

	dist_type = ifdef ($dfam, 1); # $dfam = 1;
	var_power = ifdef ($vpow, 0.0); # $vpow = 0.0;
	link_type = ifdef ($link, 0); # $link = 0;
	link_power = ifdef ($lpow, 1.0); # $lpow = 1.0;
	dispersion = ifdef ($disp, 1.0); # $disp = 1.0;

	var_power = as.double (var_power);
	link_power = as.double (link_power);
	dispersion = as.double (dispersion);

	if (dist_type == 3) {
	link_type = 2;
	} else { if (link_type == 0) { # Canonical Link
	if (dist_type == 1) {
	link_type = 1;
	link_power = 1.0 - var_power;
	} else { if (dist_type == 2) {
	link_type = 2;
	}}} }

	X = read(fileX);
	#X = table(X[,1], X[,2], X[,3])

	num_records = nrow (X);
	num_features = ncol (X);

	W = read (fileB);
	if (dist_type == 3) {
	beta = W [1 : ncol (X), ];
	intercept = W [nrow(W), ];
	} else {
	beta = W [1 : ncol (X), 1];
	intercept = W [nrow(W), 1];
	}
	if (nrow (W) == ncol (X)) {
	intercept = 0.0 * intercept;
	is_intercept = FALSE;
	} else {
	num_features = num_features + 1;
	is_intercept = TRUE;
	}

	ones_rec = matrix (1, rows = num_records, cols = 1);
	linear_terms = X %% beta + ones_rec %% intercept;
	[means, vars] =
	glm_means_and_vars (linear_terms, dist_type, var_power, link_type, link_power);

	if (fileM != " ") {
	write (means, fileM, format=fmtM);
	}

	predicted_y = rowIndexMax(means)
	write(predicted_y, $P, format=fmtM)

	if (fileY != " ")
	{
	Y = read (fileY);
	ones_ctg = matrix (1, rows = ncol(Y), cols = 1);

	# Statistics To Compute:

	Z_logl = NaN;
	Z_logl_pValue = NaN;
	X2_pearson = NaN;
	df_pearson = -1;
	G2_deviance = NaN;
	df_deviance = -1;
	X2_pearson_pValue = NaN;
	G2_deviance_pValue = NaN;
	Z_logl_scaled = NaN;
	Z_logl_scaled_pValue = NaN;
	X2_scaled = NaN;
	X2_scaled_pValue = NaN;
	G2_scaled = NaN;
	G2_scaled_pValue = NaN;

	# set Y_counts to avoid 'Initialization of Y_counts depends on if-else execution' warning
	Y_counts = matrix(0.0, rows=1, cols=1);

	if (dist_type == 1 & link_type == 1) {
	#
	# POWER DISTRIBUTIONS (GAUSSIAN, POISSON, GAMMA, ETC.)
	#
	if (link_power == 0.0) {
	is_zero_Y = (Y == 0.0);
	lt_saturated = log (Y + is_zero_Y) - is_zero_Y / (1.0 - is_zero_Y);
	} else {
	lt_saturated = Y ^ link_power;
	}
	Y_counts = ones_rec;

	X2_pearson = sum ((Y - means) ^ 2 / vars);
	df_pearson = num_records - num_features;

	log_l_part =
	glm_partial_loglikelihood_for_power_dist_and_link (linear_terms, Y, var_power, link_power);
	log_l_part_saturated =
	glm_partial_loglikelihood_for_power_dist_and_link (lt_saturated, Y, var_power, link_power);

	G2_deviance = 2 * sum (log_l_part_saturated) - 2 * sum (log_l_part);
	df_deviance = num_records - num_features;

	} else { if (dist_type >= 2) {
	#
	# BINOMIAL AND MULTINOMIAL DISTRIBUTIONS
	#
	if (ncol (Y) == 1) {
	num_categories = ncol (beta) + 1;
	if (min (Y) <= 0) {
	# Category labels "0", "-1" etc. are converted into the baseline label
	Y = Y + (- Y + num_categories) * (Y <= 0);
	}
	Y_size = min (num_categories, max(Y));
	Y_unsized = table (seq (1, num_records, 1), Y);
	Y = matrix (0, rows = num_records, cols = num_categories);
	Y [, 1 : Y_size] = Y_unsized [, 1 : Y_size];
	Y_counts = ones_rec;
	} else {
	Y_counts = rowSums (Y);
	}

	P = means;
	zero_Y = (Y == 0.0);
	zero_P = (P == 0.0);
	ones_ctg = matrix (1, rows = ncol(Y), cols = 1);

	logl_vec = rowSums (Y * log (P + zero_Y) );
	ent1_vec = rowSums (P * log (P + zero_P) );
	ent2_vec = rowSums (P * (log (P + zero_P))^2);
	E_logl = sum (Y_counts * ent1_vec);
	V_logl = sum (Y_counts * (ent2_vec - ent1_vec ^ 2));
	Z_logl = (sum (logl_vec) - E_logl) / sqrt (V_logl);

	means = means * (Y_counts %*% t(ones_ctg));
	vars = vars * (Y_counts %*% t(ones_ctg));

	frac_below_5 = sum (means < 5) / (nrow (means) * ncol (means));
	frac_below_1 = sum (means < 1) / (nrow (means) * ncol (means));

	if (frac_below_5 > 0.2 \| frac_below_1 > 0.0) {
	print ("WARNING: residual statistics are inaccurate here due to low cell means.");
	}

	X2_pearson = sum ((Y - means) ^ 2 / means);
	df_pearson = (num_records - num_features) * (ncol(Y) - 1);

	G2_deviance = 2 * sum (Y * log ((Y + zero_Y) / (means + zero_Y)));
	df_deviance = (num_records - num_features) * (ncol(Y) - 1);
	}}

	if (Z_logl == Z_logl) {
	Z_logl_absneg = - abs (Z_logl);
	Z_logl_pValue = 2.0 * pnorm(target = Z_logl_absneg);
	}
	if (X2_pearson == X2_pearson & df_pearson > 0) {
	X2_pearson_pValue = pchisq(target = X2_pearson, df = df_pearson, lower.tail=FALSE);
	}
	if (G2_deviance == G2_deviance & df_deviance > 0) {
	G2_deviance_pValue = pchisq(target = G2_deviance, df = df_deviance, lower.tail=FALSE);
	}

	Z_logl_scaled = Z_logl / sqrt (dispersion);
	X2_scaled = X2_pearson / dispersion;
	G2_scaled = G2_deviance / dispersion;

	if (Z_logl_scaled == Z_logl_scaled) {
	Z_logl_scaled_absneg = - abs (Z_logl_scaled);
	Z_logl_scaled_pValue = 2.0 * pnorm(target = Z_logl_scaled_absneg);
	}
	if (X2_scaled == X2_scaled & df_pearson > 0) {
	X2_scaled_pValue = pchisq(target = X2_scaled, df = df_pearson, lower.tail=FALSE);
	}
	if (G2_scaled == G2_scaled & df_deviance > 0) {
	G2_scaled_pValue = pchisq(target = G2_scaled, df = df_deviance, lower.tail=FALSE);
	}

	avg_tot_Y = colSums ( Y ) / sum (Y_counts);
	avg_res_Y = colSums (Y - means) / sum (Y_counts);

	ss_avg_tot_Y = colSums (( Y - Y_counts %*% avg_tot_Y) ^ 2);
	ss_res_Y = colSums ((Y - means) ^ 2);
	ss_avg_res_Y = colSums ((Y - means - Y_counts %*% avg_res_Y) ^ 2);

	df_ss_res_Y = sum (Y_counts) - num_features;
	if (is_intercept) {
	df_ss_avg_res_Y = df_ss_res_Y;
	} else {
	df_ss_avg_res_Y = df_ss_res_Y - 1;
	}

	var_tot_Y = ss_avg_tot_Y / (sum (Y_counts) - 1);
	if (df_ss_avg_res_Y > 0) {
	var_res_Y = ss_avg_res_Y / df_ss_avg_res_Y;
	} else {
	var_res_Y = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
	}
	plain_R2_nobias = 1 - ss_avg_res_Y / ss_avg_tot_Y;
	adjust_R2_nobias = 1 - var_res_Y / var_tot_Y;
	plain_R2 = 1 - ss_res_Y / ss_avg_tot_Y;
	if (df_ss_res_Y > 0) {
	adjust_R2 = 1 - (ss_res_Y / df_ss_res_Y) / var_tot_Y;
	} else {
	adjust_R2 = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
	}

	predicted_avg_var_res_Y = dispersion * colSums (vars) / sum (Y_counts);

	# PREPARING THE OUTPUT CSV STATISTICS FILE

	str = "LOGLHOOD_Z,,FALSE," + Z_logl;
	str = append (str, "LOGLHOOD_Z_PVAL,,FALSE," + Z_logl_pValue);
	str = append (str, "PEARSON_X2,,FALSE," + X2_pearson);
	str = append (str, "PEARSON_X2_BY_DF,,FALSE," + (X2_pearson / df_pearson));
	str = append (str, "PEARSON_X2_PVAL,,FALSE," + X2_pearson_pValue);
	str = append (str, "DEVIANCE_G2,,FALSE," + G2_deviance);
	str = append (str, "DEVIANCE_G2_BY_DF,,FALSE," + (G2_deviance / df_deviance));
	str = append (str, "DEVIANCE_G2_PVAL,,FALSE," + G2_deviance_pValue);
	str = append (str, "LOGLHOOD_Z,,TRUE," + Z_logl_scaled);
	str = append (str, "LOGLHOOD_Z_PVAL,,TRUE," + Z_logl_scaled_pValue);
	str = append (str, "PEARSON_X2,,TRUE," + X2_scaled);
	str = append (str, "PEARSON_X2_BY_DF,,TRUE," + (X2_scaled / df_pearson));
	str = append (str, "PEARSON_X2_PVAL,,TRUE," + X2_scaled_pValue);
	str = append (str, "DEVIANCE_G2,,TRUE," + G2_scaled);
	str = append (str, "DEVIANCE_G2_BY_DF,,TRUE," + (G2_scaled / df_deviance));
	str = append (str, "DEVIANCE_G2_PVAL,,TRUE," + G2_scaled_pValue);

	for (i in 1:ncol(Y)) {
	str = append (str, "AVG_TOT_Y," + i + ",," + as.scalar (avg_tot_Y [1, i]));
	str = append (str, "STDEV_TOT_Y," + i + ",," + as.scalar (sqrt (var_tot_Y [1, i])));
	str = append (str, "AVG_RES_Y," + i + ",," + as.scalar (avg_res_Y [1, i]));
	str = append (str, "STDEV_RES_Y," + i + ",," + as.scalar (sqrt (var_res_Y [1, i])));
	str = append (str, "PRED_STDEV_RES," + i + ",TRUE," + as.scalar (sqrt (predicted_avg_var_res_Y [1, i])));
	str = append (str, "PLAIN_R2," + i + ",," + as.scalar (plain_R2 [1, i]));
	str = append (str, "ADJUSTED_R2," + i + ",," + as.scalar (adjust_R2 [1, i]));
	str = append (str, "PLAIN_R2_NOBIAS," + i + ",," + as.scalar (plain_R2_nobias [1, i]));
	str = append (str, "ADJUSTED_R2_NOBIAS," + i + ",," + as.scalar (adjust_R2_nobias [1, i]));
	}

	if (fileO != " ") {
	write (str, fileO);
	} else {
	print (str);
	}
	}

	glm_means_and_vars =
	function (Matrix[double] linear_terms, int dist_type, double var_power, int link_type, double link_power)
	return (Matrix[double] means, Matrix[double] vars)
	# NOTE: "vars" represents the variance without dispersion, i.e. the V(mu) function.
	{
	num_points = nrow (linear_terms);
	if (dist_type == 1 & link_type == 1) {
	# POWER DISTRIBUTION
	if (link_power == 0.0) {
	y_mean = exp (linear_terms);
	} else { if (link_power == 1.0) {
	y_mean = linear_terms;
	} else { if (link_power == -1.0) {
	y_mean = 1.0 / linear_terms;
	} else {
	y_mean = linear_terms ^ (1.0 / link_power);
	}}}
	if (var_power == 0.0) {
	var_function = matrix (1.0, rows = num_points, cols = 1);
	} else { if (var_power == 1.0) {
	var_function = y_mean;
	} else {
	var_function = y_mean ^ var_power;
	}}
	means = y_mean;
	vars = var_function;
	} else { if (dist_type == 2 & link_type >= 1 & link_type <= 5) {
	# BINOMIAL/BERNOULLI DISTRIBUTION
	y_prob = matrix (0.0, rows = num_points, cols = 2);
	if (link_type == 1 & link_power == 0.0) { # Binomial.log
	y_prob [, 1] = exp (linear_terms);
	y_prob [, 2] = 1.0 - y_prob [, 1];
	} else { if (link_type == 1 & link_power != 0.0) { # Binomial.power_nonlog
	y_prob [, 1] = linear_terms ^ (1.0 / link_power);
	y_prob [, 2] = 1.0 - y_prob [, 1];
	} else { if (link_type == 2) { # Binomial.logit
	elt = exp (linear_terms);
	y_prob [, 1] = elt / (1.0 + elt);
	y_prob [, 2] = 1.0 / (1.0 + elt);
	} else { if (link_type == 3) { # Binomial.probit
	sign_lt = 2 * (linear_terms >= 0.0) - 1;
	t_gp = 1.0 / (1.0 + abs (linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0)
	erf_corr =
	t_gp * ( 0.254829592
	+ t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
	+ t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
	+ t_gp * (-1.453152027
	+ t_gp * 1.061405429)))) * sign_lt * exp (- (linear_terms ^ 2) / 2.0);
	y_prob [, 1] = (1 + sign_lt) - erf_corr;
	y_prob [, 2] = (1 - sign_lt) + erf_corr;
	y_prob = y_prob / 2;
	} else { if (link_type == 4) { # Binomial.cloglog
	elt = exp (linear_terms);
	is_too_small = ((10000000 + elt) == 10000000);
	y_prob [, 2] = exp (- elt);
	y_prob [, 1] = (1 - is_too_small) * (1.0 - y_prob [, 2]) + is_too_small * elt * (1.0 - elt / 2);
	} else { if (link_type == 5) { # Binomial.cauchit
	atan_linear_terms = atan (linear_terms);
	y_prob [, 1] = 0.5 + atan_linear_terms / pi;
	y_prob [, 2] = 0.5 - atan_linear_terms / pi;
	}}}}}}
	means = y_prob;
	ones_ctg = matrix (1, rows = 2, cols = 1);
	vars = means * (means %*% (1 - diag (ones_ctg)));
	} else { if (dist_type == 3) {
	# MULTINOMIAL LOGIT DISTRIBUTION
	elt = exp (linear_terms);
	ones_pts = matrix (1, rows = num_points, cols = 1);
	elt = cbind (elt, ones_pts);
	ones_ctg = matrix (1, rows = ncol (elt), cols = 1);
	means = elt / (rowSums (elt) %*% t(ones_ctg));
	vars = means * (means %*% (1 - diag (ones_ctg)));
	} else {
	means = matrix (0.0, rows = num_points, cols = 1);
	vars = matrix (0.0, rows = num_points, cols = 1);
	} }}}

	glm_partial_loglikelihood_for_power_dist_and_link = # Assumes: dist_type == 1 & link_type == 1
	function (Matrix[double] linear_terms, Matrix[double] Y, double var_power, double link_power)
	return (Matrix[double] log_l_part)
	{
	num_records = nrow (Y);
	if (var_power == 1.0) { # Poisson
	if (link_power == 0.0) { # Poisson.log
	is_natural_parameter_log_zero = (linear_terms == -Inf);
	natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
	b_cumulant = exp (linear_terms);
	} else { # Poisson.power_nonlog
	is_natural_parameter_log_zero = (linear_terms == 0.0);
	natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
	b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
	}
	is_minus_infinity = (Y > 0) * is_natural_parameter_log_zero;
	log_l_part = Y * natural_parameters - b_cumulant - is_minus_infinity / (1 - is_minus_infinity);
	} else {
	if (var_power == 2.0 & link_power == 0.0) { # Gamma.log
	natural_parameters = - exp (- linear_terms);
	b_cumulant = linear_terms;
	} else { if (var_power == 2.0) { # Gamma.power_nonlog
	natural_parameters = - linear_terms ^ (- 1.0 / link_power);
	b_cumulant = log (linear_terms) / link_power;
	} else { if (link_power == 0.0) { # PowerDist.log
	natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
	b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
	} else { # PowerDist.power_nonlog
	power_np = (1.0 - var_power) / link_power;
	natural_parameters = (linear_terms ^ power_np) / (1.0 - var_power);
	power_cu = (2.0 - var_power) / link_power;
	b_cumulant = (linear_terms ^ power_cu) / (2.0 - var_power);
	}}}
	log_l_part = Y * natural_parameters - b_cumulant;
	} }