scripts/algorithms/StepGLM.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT CHOOSES A GLM REGRESSION MODEL IN A STEPWISE ALGIRITHM USING AIC
 # EACH GLM REGRESSION IS SOLVED USING NEWTON/FISHER SCORING WITH TRUST REGIONS
 #
 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME  TYPE     DEFAULT   MEANING
 # ---------------------------------------------------------------------------------------------
 # X     String    ---      Location to read the matrix X of feature vectors
 # Y     String    ---      Location to read response matrix Y with 1 column
 # B     String    ---      Location to store estimated regression parameters (the betas)
 # S     String    ---      Location to write the selected features ordered as computed by the algorithm
 # O     String    " "      Location to write the printed statistics; by default is standard output
 # link  Int       2        Link function code: 1 = log, 2 = Logit, 3 = Probit, 4 = Cloglog
 # yneg  Double    0.0      Response value for Bernoulli "No" label, usually 0.0 or -1.0
 # icpt  Int       0        Intercept presence, X columns shifting and rescaling:
 #                          0 = no intercept, no shifting, no rescaling;
 #                          1 = add intercept, but neither shift nor rescale X;
 #                          2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
 # tol   Double    0.000001 Tolerance (epsilon)
 # disp  Double    0.0      (Over-)dispersion value, or 0.0 to estimate it from data
 # moi   Int       200      Maximum number of outer (Newton / Fisher Scoring) iterations
 # mii   Int       0        Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum
 # thr   Double    0.01     Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
 #                          no further features are being checked and the algorithm stops
 # fmt   String   "text"    The betas matrix output format, such as "text" or "csv"
 # ---------------------------------------------------------------------------------------------
 # OUTPUT: Matrix beta, whose size depends on icpt:
 #     icpt=0: ncol(X) x 1;  icpt=1: (ncol(X) + 1) x 1;  icpt=2: (ncol(X) + 1) x 2
 #
 # In addition, in the last run of GLM some statistics are provided in CSV format, one comma-separated name-value
 # pair per each line, as follows:
 #
 # NAME                  MEANING
 # -------------------------------------------------------------------------------------------
 # TERMINATION_CODE      A positive integer indicating success/failure as follows:
 #                       1 = Converged successfully; 2 = Maximum number of iterations reached;
 #                       3 = Input (X, Y) out of range; 4 = Distribution/link is not supported
 # BETA_MIN              Smallest beta value (regression coefficient), excluding the intercept
 # BETA_MIN_INDEX        Column index for the smallest beta value
 # BETA_MAX              Largest beta value (regression coefficient), excluding the intercept
 # BETA_MAX_INDEX        Column index for the largest beta value
 # INTERCEPT             Intercept value, or NaN if there is no intercept (if icpt=0)
 # DISPERSION            Dispersion used to scale deviance, provided as "disp" input parameter
 #                       or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0
 # DISPERSION_EST        Dispersion estimated from the dataset
 # DEVIANCE_UNSCALED     Deviance from the saturated model, assuming dispersion == 1.0
 # DEVIANCE_SCALED       Deviance from the saturated model, scaled by the DISPERSION value
 # -------------------------------------------------------------------------------------------
 #
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar SystemDS.jar -f StepGLM.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
 # 			 S=OUTPUT_DIR_S/selected O=OUTPUT_DIR/stats link=2 yneg=-1.0 icpt=2 tol=0.00000001
 #			 disp=1.0 moi=100 mii=10 thr=0.01 fmt=csv
 #
 # THE StepGLM SCRIPT CURRENTLY SUPPORTS BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY!
 # 	- LOG
 #	- LOGIT
 #	- PROBIT
 #	- CLOGLOG

 fileX = $X;
 fileY = $Y;
 fileB = $B;
 intercept_status = ifdef ($icpt, 0);
 thr = ifdef ($thr, 0.01);
 bernoulli_No_label = ifdef ($yneg, 0.0);    # $yneg = 0.0;
 distribution_type = 2;

 bernoulli_No_label = as.double (bernoulli_No_label);

 # currently only the forward selection strategy in supported: start from one feature and iteratively add
 # features until AIC improves
 dir = "forward";

 print("BEGIN STEPWISE GLM SCRIPT");
 print ("Reading X and Y...");
 X_orig = read (fileX);
 Y = read (fileY);

 if (distribution_type == 2 & ncol(Y) == 1) {
 	is_Y_negative = (Y == bernoulli_No_label);
 	Y = cbind (1 - is_Y_negative, is_Y_negative);
 	count_Y_negative = sum (is_Y_negative);
 	if (count_Y_negative == 0) {
 		stop ("StepGLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label");
 	}
 	if (count_Y_negative == nrow(Y)) {
 		stop ("StepGLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label");
 	}
 }

 num_records = nrow (X_orig);
 num_features = ncol (X_orig);

 # BEGIN STEPWISE GENERALIZED LINEAR MODELS

 if (dir == "forward") {

 	continue = TRUE;
 	columns_fixed = matrix (0, rows = 1, cols = num_features);
 	columns_fixed_ordered = matrix (0, rows = 1, cols = 1);

 	# X_global stores the best model found at each step
 	X_global = matrix (0, rows = num_records, cols = 1);

 	if (intercept_status == 0) {
 		# Compute AIC of an empty model with no features and no intercept (all Ys are zero)
 		[AIC_best] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered, " ");
 	} else {
 		# compute AIC of an empty model with only intercept (all Ys are constant)
 		all_ones = matrix (1, rows = num_records, cols = 1);
 		[AIC_best] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered, " ");
 	}
 	print ("Best AIC without any features: " + AIC_best);

 	# First pass to examine single features
 	AICs = matrix (AIC_best, rows = 1, cols = num_features);
 	parfor (i in 1:num_features) {
 		[AIC_1] = glm_fit (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered, " ");
 		AICs[1,i] = AIC_1;
 	}

 	# Determine the best AIC
 	column_best = 0;
 	for (k in 1:num_features) {
 		AIC_cur = as.scalar (AICs[1,k]);
 		if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
 			column_best = k;
 			AIC_best = as.scalar(AICs[1,k]);
 		}
 	}

 	if (column_best == 0) {
 		print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
 		if (intercept_status == 0) {
 			# Compute AIC of an empty model with no features and no intercept (all Ys are zero)
 			[AIC_best] = glm_fit (X_global, Y, 0, num_features, columns_fixed_ordered, fileB);
 		} else {
 			# compute AIC of an empty model with only intercept (all Ys are constant)
 			###all_ones = matrix (1, rows = num_records, cols = 1);
 			[AIC_best] = glm_fit (all_ones, Y, 0, num_features, columns_fixed_ordered, fileB);
 		}
 	};

 	print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
 	columns_fixed[1,column_best] = 1;
 	columns_fixed_ordered[1,1] = column_best;
 	X_global = X_orig[,column_best];

 	while (continue) {
 		# Subsequent passes over the features
 		parfor (i in 1:num_features) {
 			if (as.scalar(columns_fixed[1,i]) == 0) {

 				# Construct the feature matrix
 				X = cbind (X_global, X_orig[,i]);

 				[AIC_2] = glm_fit (X, Y, intercept_status, num_features, columns_fixed_ordered, " ");
 				AICs[1,i] = AIC_2;
 			}
 		}

 		# Determine the best AIC
 		for (k in 1:num_features) {
 			AIC_cur = as.scalar (AICs[1,k]);
 			if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
 				column_best = k;
 				AIC_best = as.scalar(AICs[1,k]);
 			}
 		}

 		# cbind best found features (i.e., columns) to X_global
 		if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
 			print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
 			columns_fixed[1,column_best] = 1;
 			columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best));
 			if (ncol(columns_fixed_ordered) == num_features) { # all features examined
 				X_global = cbind (X_global, X_orig[,column_best]);
 				continue = FALSE;
 			} else {
 				X_global = cbind (X_global, X_orig[,column_best]);
 			}
 		} else {
 		continue = FALSE;
 		}
 	}

 	# run GLM with selected set of features
 	print ("Running GLM with selected features...");
 	[AIC] = glm_fit (X_global, Y, intercept_status, num_features, columns_fixed_ordered, fileB);

 } else {
 	stop ("Currently only forward selection strategy is supported!");
 }


 ################### UDFS USED IN THIS SCRIPT ##################

 glm_fit = function (Matrix[Double] X, Matrix[Double] Y, Int intercept_status, Double num_features_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {

 	# distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only Bernouli distribution family is supported!
 	distribution_type = 2;                		# $dfam = 2;
 	variance_as_power_of_the_mean = 0.0;  		# $vpow = 0.0;
 	# link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit;
 	# currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported!
 	link_type = ifdef ($link, 2);         		# $link = 2;
 	link_as_power_of_the_mean = 0.0;      		# $lpow = 0.0;

 	dispersion = ifdef ($disp, 0.0);            # $disp = 0.0;
 	eps = ifdef ($tol, 0.000001);               # $tol  = 0.000001;
 	max_iteration_IRLS = ifdef ($moi, 200);     # $moi  = 200;
 	max_iteration_CG = ifdef ($mii, 0);         # $mii  = 0;

 	variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean);
 	link_as_power_of_the_mean = as.double (link_as_power_of_the_mean);

 	dispersion = as.double (dispersion);
 	eps = as.double (eps);

 	# Default values for output statistics:
 	regularization = 0.0;
 	termination_code     = 0.0;
 	min_beta             = NaN;
 	i_min_beta           = NaN;
 	max_beta             = NaN;
 	i_max_beta           = NaN;
 	intercept_value      = NaN;
 	dispersion           = NaN;
 	estimated_dispersion = NaN;
 	deviance_nodisp      = NaN;
 	deviance             = NaN;

 	#####   INITIALIZE THE PARAMETERS   #####

     num_records  = nrow (X);
     num_features = ncol (X);
     zeros_r = matrix (0, rows = num_records, cols = 1);
     ones_r = 1 + zeros_r;

     # Introduce the intercept, shift and rescale the columns of X if needed

     if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
 		X = cbind (X, ones_r);
         num_features = ncol (X);
 	}

     scale_lambda = matrix (1, rows = num_features, cols = 1);
     if (intercept_status == 1 | intercept_status == 2) {
 		scale_lambda [num_features, 1] = 0;
     }

     if (intercept_status == 2) {  # scale-&-shift X columns to mean 0, variance 1
 		# Important assumption: X [, num_features] = ones_r
         avg_X_cols = t(colSums(X)) / num_records;
         var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1);
         is_unsafe = (var_X_cols <= 0);
         scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
         scale_X [num_features, 1] = 1;
         shift_X = - avg_X_cols * scale_X;
         shift_X [num_features, 1] = 0;
         rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
 	} else {
         scale_X = matrix (1, rows = num_features, cols = 1);
         shift_X = matrix (0, rows = num_features, cols = 1);
         rowSums_X_sq = rowSums (X ^ 2);
     }

     # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
     # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
     # The transform is then associatively applied to the other side of the expression,
     # and is rewritten via "scale_X" and "shift_X" as follows:
     #
     # ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
     # ssX_A  = diag (scale_X) %*% A;
     # ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
     #
     # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
     # tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ];

     # Initialize other input-dependent parameters

     lambda = scale_lambda * regularization;
     if (max_iteration_CG == 0) {
 		max_iteration_CG = num_features;
     }

     # Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const]

     if (link_type == 0) {
 		if (distribution_type == 1) {
 			link_type = 1;
             link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean;
 		} else {
 			if (distribution_type == 2) {
 				link_type = 2;
 			}
 		}
 	}

     # For power distributions and/or links, we use two constants,
     # "variance as power of the mean" and "link_as_power_of_the_mean",
     # to specify the variance and the link as arbitrary powers of the
     # mean.  However, the variance-powers of 1.0 (Poisson family) and
     # 2.0 (Gamma family) have to be treated as special cases, because
     # these values integrate into logarithms.  The link-power of 0.0
     # is also special as it represents the logarithm link.

     num_response_columns = ncol (Y);
     is_supported = 0;
 	if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION
 		is_supported = 1;
 	}
 	if (num_response_columns == 1 & distribution_type == 2) {
 		print ("Error: Bernoulli response matrix has not been converted into two-column format.");
     }

 	if (is_supported == 1) {

 		#####   INITIALIZE THE BETAS   #####

         [beta, saturated_log_l, isNaN] =
 			glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG);

 		# print(" --- saturated logLik " + saturated_log_l);

         if (isNaN == 0) {

 			#####  START OF THE MAIN PART  #####

             sum_X_sq = sum (rowSums_X_sq);
             trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq));
             ###  max_trust_delta = trust_delta * 10000.0;
             log_l = 0.0;
             deviance_nodisp = 0.0;
             new_deviance_nodisp = 0.0;
             isNaN_log_l = 2;
             newbeta = beta;
             g = matrix (0.0, rows = num_features, cols = 1);
             g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
             accept_new_beta = 1;
             reached_trust_boundary = 0;
             neg_log_l_change_predicted = 0.0;
             i_IRLS = 0;

             # print ("BEGIN IRLS ITERATIONS...");

             ssX_newbeta = diag (scale_X) %*% newbeta;
             ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
             all_linear_terms = X %*% ssX_newbeta;

             [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
 				(all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);

             if (isNaN_new_log_l == 0) {
 				new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
                 new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
             }

             while (termination_code == 0) {
 				accept_new_beta = 1;

                 if (i_IRLS > 0) {
 					if (isNaN_log_l == 0) {
 						accept_new_beta = 0;
                     }

                     # Decide whether to accept a new iteration point and update the trust region
                     # See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright

                     rho = (- new_log_l + log_l) / neg_log_l_change_predicted;
 					if (rho < 0.25 | isNaN_new_log_l == 1) {
 						trust_delta = 0.25 * trust_delta;
 					}
 					if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) {
 						trust_delta = 2 * trust_delta;

 						### if (trust_delta > max_trust_delta) {
 						###     trust_delta = max_trust_delta;
 						### }
 					}
 					if (rho > 0.1 & isNaN_new_log_l == 0) {
 						accept_new_beta = 1;
 					}
 				}

                 if (accept_new_beta == 1) {
 					beta = newbeta;  log_l = new_log_l;  deviance_nodisp = new_deviance_nodisp;  isNaN_log_l = isNaN_new_log_l;

 					[g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);

 					# We introduced these variables to avoid roundoff errors:
 					#     g_Y = y_residual / (y_var * link_grad);
 					#     w   = 1.0 / (y_var * link_grad * link_grad);

 					gXY = - t(X) %*% g_Y;
 					g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ];
 					g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
 				}

                 [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] =
 					get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG);

 				newbeta = beta + z;

 				ssX_newbeta = diag (scale_X) %*% newbeta;
 				ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
 				all_linear_terms = X %*% ssX_newbeta;

 				[new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
 					(all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);

 				if (isNaN_new_log_l == 0) {
 					new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
 					new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
 				}

 				log_l_change = new_log_l - log_l;               # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps

 				if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 &
 				   (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) {
 					termination_code = 1;
 				}
 				rho = - log_l_change / neg_log_l_change_predicted;
 				z_norm = sqrt (sum (z * z));

 				i_IRLS = i_IRLS + 1;

 				if (i_IRLS == max_iteration_IRLS) {
 					termination_code = 2;
 				}
 			}

             beta = newbeta;
             log_l = new_log_l;
             deviance_nodisp = new_deviance_nodisp;

             #---------------------------- last part

 			if (termination_code != 1) {
 				print ("One of the runs of GLM did not converged in " + i_IRLS + " steps!");
 			}

             ##### COMPUTE AIC #####

 			if (distribution_type == 2 & link_type >= 1 & link_type <= 4) {
 				AIC = -2 * log_l;
 				if (sum (X) != 0) {
 					AIC = AIC + 2 * num_features;
 				}
 			} else {
 				stop ("Currently only the Bernoulli distribution family the following link functions are supported: log, logit, probit, and cloglog!");
 			}

             if (fileB != " ") {
 				fileO = ifdef ($O, " ");
 				fileS = $S;
 				fmt  = ifdef ($fmt, "text");

 				# Output which features give the best AIC and are being used for linear regression
 				write (Selected, fileS, format=fmt);

 				ssX_beta = diag (scale_X) %*% beta;
                 ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta;
 				if (intercept_status == 2) {
 					beta_out = cbind (ssX_beta, beta);
                 } else {
 					beta_out = ssX_beta;
                 }

                 if (intercept_status == 0 & num_features == 1) {
 					p = sum (X == 1);
 					if (p == num_records) {
 						beta_out = beta_out[1,];
 					}
                 }


                 if (intercept_status == 1 | intercept_status == 2) {
 					intercept_value = as.scalar (beta_out [num_features, 1]);
                     beta_noicept = beta_out [1 : (num_features - 1), 1];
                 } else {
 					beta_noicept = beta_out [1 : num_features, 1];
                 }
                 min_beta = min (beta_noicept);
                 max_beta = max (beta_noicept);
                 tmp_i_min_beta = rowIndexMin (t(beta_noicept))
                 i_min_beta = as.scalar (tmp_i_min_beta [1, 1]);
                 tmp_i_max_beta = rowIndexMax (t(beta_noicept))
                 i_max_beta = as.scalar (tmp_i_max_beta [1, 1]);

                 #####  OVER-DISPERSION PART  #####

                 all_linear_terms = X %*% ssX_beta;
 				[g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);

                 pearson_residual_sq = g_Y ^ 2 / w;
                 pearson_residual_sq = replace (target = pearson_residual_sq, pattern = NaN, replacement = 0);
                 # pearson_residual_sq = (y_residual ^ 2) / y_var;

                 if (num_records > num_features) {
 					estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features);
                 }
                 if (dispersion <= 0) {
 					dispersion = estimated_dispersion;
                 }
                 deviance = deviance_nodisp / dispersion;

                 #####  END OF THE MAIN PART  #####

                 str = "BETA_MIN," + min_beta;
                 str = append (str, "BETA_MIN_INDEX," + i_min_beta);
                 str = append (str, "BETA_MAX," + max_beta);
                 str = append (str, "BETA_MAX_INDEX," + i_max_beta);
                 str = append (str, "INTERCEPT," + intercept_value);
                 str = append (str, "DISPERSION," + dispersion);
                 str = append (str, "DISPERSION_EST," + estimated_dispersion);
                 str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp);
                 str = append (str, "DEVIANCE_SCALED," + deviance);

                 if (fileO != " ") {
 					write (str, fileO);
                 }
 				else {
 					print (str);
                 }

 				# Prepare the output matrix
 				print ("Writing the output matrix...");
                 if (intercept_status == 0 & num_features == 1) {
 					if (p == num_records) {
 						beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1);
 						beta_out_tmp[num_features_orig + 1,] = beta_out;
 						beta_out = beta_out_tmp;
 						write (beta_out, fileB, format=fmt);
 						stop ("");
 					} else if (sum (X) == 0){
 						beta_out = matrix (0, rows = num_features_orig, cols = 1);
 						write (beta_out, fileB, format=fmt);
 						stop ("");
 					}
 				}

 				no_selected = ncol (Selected);
 				max_selected = max (Selected);
 				last = max_selected + 1;

 				if (intercept_status != 0) {

 					Selected_ext = cbind (Selected, as.matrix (last));
 					P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext));

 					if (intercept_status == 2) {

 						P1_ssX_beta = P1 * ssX_beta;
 						P2_ssX_beta = colSums (P1_ssX_beta);
 						P1_beta = P1 * beta;
 						P2_beta = colSums (P1_beta);

 						if (max_selected < num_features_orig) {

 							P2_ssX_beta = cbind (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
 							P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));

 							P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1];
 							P2_ssX_beta[1, max_selected + 1] = 0;

 							P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1];
 							P2_beta[1, max_selected + 1] = 0;

 						}
 						beta_out = cbind (t(P2_ssX_beta), t(P2_beta));

 					} else {

 						P1_beta = P1 * beta;
 						P2_beta = colSums (P1_beta);

 						if (max_selected < num_features_orig) {
 							P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
 							P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1] ;
 							P2_beta[1, max_selected + 1] = 0;
 						}
 						beta_out = t(P2_beta);

 					}
 				} else {

 					P1 = table (seq (1, no_selected), t(Selected));
 					P1_beta = P1 * beta;
 					P2_beta = colSums (P1_beta);

 					if (max_selected < num_features_orig) {
 						P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
 					}

 					beta_out = t(P2_beta);
 				}

 				write ( beta_out, fileB, format=fmt );

 			}

 		} else {
 			stop ("Input matrices X and/or Y are out of range!");
         }
 	} else {
 		stop ("Response matrix with " + num_response_columns + " columns, distribution family (" + distribution_type + ", " + variance_as_power_of_the_mean
                + ") and link family (" + link_type + ", " + link_as_power_of_the_mean + ") are NOT supported together.");
     }
 }

 glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG)
   return (Matrix[double] beta, double saturated_log_l, int isNaN)
 {
     saturated_log_l = 0.0;
     isNaN = 0;
     y_corr = Y [, 1];
     if (dist_type == 2) {
       n_corr = rowSums (Y);
       is_n_zero = (n_corr == 0);
       y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;
     }
     linear_terms = y_corr;
     if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
       if          (link_power ==  0) {
         if (sum (y_corr < 0) == 0) {
           is_zero_y_corr = (y_corr == 0);
           linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
         } else { isNaN = 1; }
       } else { if (link_power ==  1.0) {
         linear_terms = y_corr;
       } else { if (link_power == -1.0) {
         linear_terms = 1.0 / y_corr;
       } else { if (link_power ==  0.5) {
         if (sum (y_corr < 0) == 0) {
           linear_terms = sqrt (y_corr);
         } else { isNaN = 1; }
       } else { if (link_power >   0) {
         if (sum (y_corr < 0) == 0) {
           is_zero_y_corr = (y_corr == 0);
           linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
         } else { isNaN = 1; }
       } else {
         if (sum (y_corr <= 0) == 0) {
           linear_terms = y_corr ^ link_power;
         } else { isNaN = 1; }
       }}}}}
     }
     if (dist_type == 2 & link_type >= 1 & link_type <= 5)
     { # BINOMIAL/BERNOULLI DISTRIBUTION
       if          (link_type == 1 & link_power == 0)  { # Binomial.log
         if (sum (y_corr < 0) == 0) {
           is_zero_y_corr = (y_corr == 0);
           linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
         } else { isNaN = 1; }
       } else { if (link_type == 1 & link_power >  0)  { # Binomial.power_nonlog pos
         if (sum (y_corr < 0) == 0) {
           is_zero_y_corr = (y_corr == 0);
           linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
         } else { isNaN = 1; }
       } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
         if (sum (y_corr <= 0) == 0) {
           linear_terms = y_corr ^ link_power;
         } else { isNaN = 1; }
       } else {
         is_zero_y_corr = (y_corr <= 0);
         is_one_y_corr  = (y_corr >= 1.0);
         y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
         if (link_type == 2)                           { # Binomial.logit
           linear_terms = log (y_corr / (1.0 - y_corr))
           + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
         } else { if (link_type == 3)                  { # Binomial.probit
           y_below_half = y_corr + (1.0 - 2.0 * y_corr) * (y_corr > 0.5);
           t = sqrt (- 2.0 * log (y_below_half));
           approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
           linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * (y_corr > 0.5))
           + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
         } else { if (link_type == 4)                  { # Binomial.cloglog
           linear_terms = log (- log (1.0 - y_corr))
           - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
           + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
         } else { if (link_type == 5)                  { # Binomial.cauchit
           linear_terms = tan ((y_corr - 0.5) * pi)
           + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
         }}  }}}}}
     }

     if (isNaN == 0) {
       [saturated_log_l, isNaN] =
         glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
     }

     if ((dist_type == 1 & link_type == 1 & link_power == 0) |
           (dist_type == 2 & link_type >= 2))
     {
       desired_eta = 0.0;
     } else { if (link_type == 1 & link_power == 0) {
       desired_eta = log (0.5);
     } else { if (link_type == 1) {
       desired_eta = 0.5 ^ link_power;
     } else {
       desired_eta = 0.5;
     }}}

     beta = matrix (0.0, rows = ncol(X), cols = 1);

     if (desired_eta != 0) {
       if (icept_status == 1 | icept_status == 2) {
         beta [nrow(beta), 1] = desired_eta;
       } else {
         # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
         # Note that "ssX_transform" is trivial here, hence ignored

         beta = straightenX (X, 0.000001, max_iter_CG);
         beta = beta * desired_eta;
       }   }   }


 glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
                      int dist_type, double var_power, int link_type, double link_power)
   return (Matrix[double] g_Y, Matrix[double] w)
 # ORIGINALLY we returned more meaningful vectors, namely:
 # Matrix[double] y_residual    : y - y_mean, i.e. y observed - y predicted
 # Matrix[double] link_gradient : derivative of the link function
 # Matrix[double] var_function  : variance without dispersion, i.e. the V(mu) function
 # BUT, this caused roundoff errors, so we had to compute "directly useful" vectors
 # and skip over the "meaningful intermediaries".  Now we output these two variables:
 #     g_Y = y_residual / (var_function * link_gradient);
 #     w   = 1.0 / (var_function * link_gradient ^ 2);
 {
     num_records = nrow (linear_terms);
     zeros_r = matrix (0.0, rows = num_records, cols = 1);
     ones_r = 1 + zeros_r;
     g_Y  = zeros_r;
     w  = zeros_r;

     # Some constants

     one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
     ones_2 = matrix (1.0, rows = 1, cols = 2);
     p_one_m_one = ones_2;
     p_one_m_one [1, 2] = -1.0;
     m_one_p_one = ones_2;
     m_one_p_one [1, 1] = -1.0;
     zero_one = ones_2;
     zero_one [1, 1] = 0.0;
     one_zero = ones_2;
     one_zero [1, 2] = 0.0;
     flip_pos = matrix (0, rows = 2, cols = 2);
     flip_neg = flip_pos;
     flip_pos [1, 2] = 1;
     flip_pos [2, 1] = 1;
     flip_neg [1, 2] = -1;
     flip_neg [2, 1] = 1;

     if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
       y_mean = zeros_r;
       if          (link_power ==  0) {
         y_mean = exp (linear_terms);
         y_mean_pow = y_mean ^ (1 - var_power);
         w   = y_mean_pow * y_mean;
         g_Y = y_mean_pow * (Y - y_mean);
       } else { if (link_power ==  1.0) {
         y_mean = linear_terms;
         w   = y_mean ^ (- var_power);
         g_Y = w * (Y - y_mean);
       } else {
         y_mean = linear_terms ^ (1.0 / link_power);
         c1  = (1 - var_power) / link_power - 1;
         c2  = (2 - var_power) / link_power - 2;
         g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
         w   = (linear_terms ^ c2) / (link_power ^ 2);
       }   }}
     if (dist_type == 2 & link_type >= 1 & link_type <= 5)
     { # BINOMIAL/BERNOULLI DISTRIBUTION
       if (link_type == 1) { # BINOMIAL.POWER LINKS
         if (link_power == 0)  { # Binomial.log
           vec1 = 1 / (exp (- linear_terms) - 1);
           g_Y = Y [, 1] - Y [, 2] * vec1;
           w   = rowSums (Y) * vec1;
         } else {                  # Binomial.nonlog
           vec1 = zeros_r;
           if (link_power == 0.5)  {
             vec1 = 1 / (1 - linear_terms ^ 2);
           } else { if (sum (linear_terms < 0) == 0) {
             vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
           } else {isNaN = 1;}}
           # We want a "zero-protected" version of
           #     vec2 = Y [, 1] / linear_terms;
           is_y_0 = (Y [, 1] == 0);
           vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
           g_Y =  (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
           w   =  rowSums (Y) * vec1 / link_power ^ 2;
         }
       } else {
         is_LT_pos_infinite = (linear_terms == Inf);
         is_LT_neg_infinite = (linear_terms == -Inf);
         is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
         finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
         finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
         if (link_type == 2)                           { # Binomial.logit
           Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
           Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
           Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
           g_Y = rowSums (Y * (Y_prob %*% flip_neg));           ### = y_residual;
           w   = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob);  ### = y_variance;
         } else { if (link_type == 3)                  { # Binomial.probit
           is_lt_pos = (linear_terms >= 0);
           t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
           pt_gp = t_gp * ( 0.254829592
                            + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
                                      + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
                                                 + t_gp * (-1.453152027
                                                           + t_gp *   1.061405429))));
           the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
           vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
           vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
           w   = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
           g_Y = one_over_sqrt_two_pi * vec2 / vec1;
         } else { if (link_type == 4)                  { # Binomial.cloglog
           the_exp = exp (linear_terms)
           the_exp_exp = exp (- the_exp);
           is_too_small = ((10000000 + the_exp) == 10000000);
           the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
           g_Y =  (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
           w   =  the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
         } else { if (link_type == 5)                  { # Binomial.cauchit
           Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
           Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
           y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
           var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
           link_gradient_normalized = (1 + linear_terms ^ 2) * pi;
           g_Y =  rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
           w   = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
         }}}}
       }
     }
   }


 glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y,
                                     int dist_type, double var_power, int link_type, double link_power)
   return (double log_l, int isNaN)
 {
     isNaN = 0;
     log_l = 0.0;
     num_records = nrow (Y);
     zeros_r = matrix (0.0, rows = num_records, cols = 1);

     if (dist_type == 1 & link_type == 1)
     { # POWER DISTRIBUTION
       b_cumulant = zeros_r;
       natural_parameters = zeros_r;
       is_natural_parameter_log_zero = zeros_r;
       if          (var_power == 1.0 & link_power == 0)  { # Poisson.log
         b_cumulant = exp (linear_terms);
         is_natural_parameter_log_zero = (linear_terms == -Inf);
         natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
       } else { if (var_power == 1.0 & link_power == 1.0)  { # Poisson.id
         if (sum (linear_terms < 0) == 0)  {
           b_cumulant = linear_terms;
           is_natural_parameter_log_zero = (linear_terms == 0);
           natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
         } else {isNaN = 1;}
       } else { if (var_power == 1.0 & link_power == 0.5)  { # Poisson.sqrt
         if (sum (linear_terms < 0) == 0)  {
           b_cumulant = linear_terms ^ 2;
           is_natural_parameter_log_zero = (linear_terms == 0);
           natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
         } else {isNaN = 1;}
       } else { if (var_power == 1.0 & link_power  > 0)  { # Poisson.power_nonlog, pos
         if (sum (linear_terms < 0) == 0)  {
           is_natural_parameter_log_zero = (linear_terms == 0);
           b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
           natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
         } else {isNaN = 1;}
       } else { if (var_power == 1.0)                      { # Poisson.power_nonlog, neg
         if (sum (linear_terms <= 0) == 0) {
           b_cumulant = linear_terms ^ (1.0 / link_power);
           natural_parameters = log (linear_terms) / link_power;
         } else {isNaN = 1;}
       } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
         if (sum (linear_terms <= 0) == 0) {
           b_cumulant = - log (linear_terms);
           natural_parameters = - linear_terms;
         } else {isNaN = 1;}
       } else { if (var_power == 2.0 & link_power ==  1.0) { # Gamma.id
         if (sum (linear_terms <= 0) == 0) {
           b_cumulant = log (linear_terms);
           natural_parameters = - 1.0 / linear_terms;
         } else {isNaN = 1;}
       } else { if (var_power == 2.0 & link_power ==  0) { # Gamma.log
         b_cumulant = linear_terms;
         natural_parameters = - exp (- linear_terms);
       } else { if (var_power == 2.0)                      { # Gamma.power_nonlog
         if (sum (linear_terms <= 0) == 0) {
           b_cumulant = log (linear_terms) / link_power;
           natural_parameters = - linear_terms ^ (- 1.0 / link_power);
         } else {isNaN = 1;}
       } else { if                    (link_power ==  0) { # PowerDist.log
         natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
         b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
       } else {                                              # PowerDist.power_nonlog
         if          (-2 * link_power == 1.0 - var_power) {
           natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
         } else { if (-1 * link_power == 1.0 - var_power) {
           natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
         } else { if (     link_power == 1.0 - var_power) {
           natural_parameters = linear_terms / (1.0 - var_power);
         } else { if ( 2 * link_power == 1.0 - var_power) {
           natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
         } else {
           if (sum (linear_terms <= 0) == 0) {
             power = (1.0 - var_power) / link_power;
             natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
           } else {isNaN = 1;}
         }}}}
         if          (-2 * link_power == 2.0 - var_power) {
           b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
         } else { if (-1 * link_power == 2.0 - var_power) {
           b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
         } else { if (     link_power == 2.0 - var_power) {
           b_cumulant = linear_terms / (2.0 - var_power);
         } else { if ( 2 * link_power == 2.0 - var_power) {
           b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
         } else {
           if (sum (linear_terms <= 0) == 0) {
             power = (2.0 - var_power) / link_power;
             b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
           } else {isNaN = 1;}
         }}}}
       }}}}} }}}}}
       if (sum (is_natural_parameter_log_zero * abs (Y)) > 0) {
         log_l = -Inf;
         isNaN = 1;
       }
       if (isNaN == 0)
       {
         log_l = sum (Y * natural_parameters - b_cumulant);
         if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
           log_l = -Inf;
           isNaN = 1;
         }   }   }

     if (dist_type == 2 & link_type >= 1 & link_type <= 5)
     { # BINOMIAL/BERNOULLI DISTRIBUTION

       [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power);

       if (isNaN == 0) {
         does_prob_contradict = (Y_prob <= 0);
         if (sum (does_prob_contradict * abs (Y)) == 0) {
           log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
           if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
             isNaN = 1;
           }
         } else {
           log_l = -Inf;
           isNaN = 1;
         }   }   }

     if (isNaN == 1) {
       log_l = - Inf;
     }
   }


 binomial_probability_two_column =
   function (Matrix[double] linear_terms, int link_type, double link_power)
     return   (Matrix[double] Y_prob, int isNaN)
 {
       isNaN = 0;
       num_records = nrow (linear_terms);

       # Define some auxiliary matrices

       ones_2 = matrix (1.0, rows = 1, cols = 2);
       p_one_m_one = ones_2;
       p_one_m_one [1, 2] = -1.0;
       m_one_p_one = ones_2;
       m_one_p_one [1, 1] = -1.0;
       zero_one = ones_2;
       zero_one [1, 1] = 0.0;
       one_zero = ones_2;
       one_zero [1, 2] = 0.0;

       zeros_r = matrix (0.0, rows = num_records, cols = 1);
       ones_r = 1.0 + zeros_r;

       # Begin the function body

       Y_prob = zeros_r %*% ones_2;
       if (link_type == 1) { # Binomial.power
         if          (link_power == 0) { # Binomial.log
           Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;
         } else { if (link_power == 0.5) { # Binomial.sqrt
           Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;
         } else {                          # Binomial.power_nonlog
           if (sum (linear_terms < 0) == 0) {
             Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;
           } else {isNaN = 1;}
         }}
       } else {              # Binomial.non_power
         is_LT_pos_infinite = (linear_terms == Inf);
         is_LT_neg_infinite = (linear_terms == -Inf);
         is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
         finite_linear_terms = replace (target =        linear_terms, pattern =  Inf, replacement = 0);
         finite_linear_terms = replace (target = finite_linear_terms, pattern = -Inf, replacement = 0);
         if (link_type == 2)             { # Binomial.logit
           Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
           Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
         } else { if (link_type == 3)    { # Binomial.probit
           lt_pos_neg = (finite_linear_terms >= 0) %*% p_one_m_one + ones_r %*% zero_one;
           t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
           pt_gp = t_gp * ( 0.254829592
                            + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
                                      + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
                                                 + t_gp * (-1.453152027
                                                           + t_gp *   1.061405429))));
           the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
           Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
         } else { if (link_type == 4)    { # Binomial.cloglog
           the_exp = exp (finite_linear_terms);
           the_exp_exp = exp (- the_exp);
           is_too_small = ((10000000 + the_exp) == 10000000);
           Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
           Y_prob [, 2] = the_exp_exp;
         } else { if (link_type == 5)    { # Binomial.cauchit
           Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / pi;
         } else {
           isNaN = 1;
         }}}}
         Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
       }   }


 # THE CG-STEIHAUG PROCEDURE SCRIPT

 # Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize
 # 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z
 # under constraint:  ||z|| <= trust_delta.
 # See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
 # IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform
 # is given separately because sparse "X" may become dense after applying the transform.
 #
 get_CG_Steihaug_point =
   function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w,
             Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG)
     return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary)
 {
       trust_delta_sq = trust_delta ^ 2;
       size_CG = nrow (g);
       z = matrix (0.0, rows = size_CG, cols = 1);
       neg_log_l_change = 0.0;
       reached_trust_boundary = 0;
       g_reg = g + lambda * beta;
       r_CG = g_reg;
       p_CG = -r_CG;
       rr_CG = sum(r_CG * r_CG);
       eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
       converged_CG = 0;
       if (rr_CG < eps_CG) {
         converged_CG = 1;
       }

       max_iteration_CG = max_iter_CG;
       if (max_iteration_CG <= 0) {
         max_iteration_CG = size_CG;
       }
       i_CG = 0;
       while (converged_CG == 0)
       {
         i_CG = i_CG + 1;
         ssX_p_CG = diag (scale_X) %*% p_CG;
         ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
         temp_CG = t(X) %*% (w * (X %*% ssX_p_CG));
         q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ];
         pq_CG = sum (p_CG * q_CG);
         if (pq_CG <= 0) {
           pp_CG = sum (p_CG * p_CG);
           if (pp_CG > 0) {
             [z, neg_log_l_change] =
               get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
             reached_trust_boundary = 1;
           } else {
             neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
           }
           converged_CG = 1;
         }
         if (converged_CG == 0) {
           alpha_CG = rr_CG / pq_CG;
           new_z = z + alpha_CG * p_CG;
           if (sum(new_z * new_z) >= trust_delta_sq) {
             pp_CG = sum (p_CG * p_CG);
             [z, neg_log_l_change] =
               get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
             reached_trust_boundary = 1;
             converged_CG = 1;
           }
           if (converged_CG == 0) {
             z = new_z;
             old_rr_CG = rr_CG;
             r_CG = r_CG + alpha_CG * q_CG;
             rr_CG = sum(r_CG * r_CG);
             if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
               neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
               reached_trust_boundary = 0;
               converged_CG = 1;
             }
             if (converged_CG == 0) {
               p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
             }   }   }   }   }


 # An auxiliary function used twice inside the CG-STEIHAUG loop:
 get_trust_boundary_point =
   function (Matrix[double] g, Matrix[double] z, Matrix[double] p,
             Matrix[double] q, Matrix[double] r, double pp, double pq,
             double trust_delta_sq)
     return (Matrix[double] new_z, double f_change)
 {
       zz = sum (z * z);  pz = sum (p * z);
       sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq));
       tau_1 = (- pz + sq_root_d) / pp;
       tau_2 = (- pz - sq_root_d) / pp;
       zq = sum (z * q);  gp = sum (g * p);
       f_extra = 0.5 * sum (z * (r + g));
       f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1;
       f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2;
       ind1 = as.integer(f_change_1 < f_change_2);
       ind2 = as.integer(f_change_1 >= f_change_2);
       new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p);
       f_change = ind1 * f_change_1 + ind2 * f_change_2;
 }


 # Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
 # We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
 # it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
 straightenX =
   function (Matrix[double] X, double eps, int max_iter_CG)
     return   (Matrix[double] w)
 {
       w_X = t(colSums(X));
       lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
       eps_LS = eps * nrow(X);

       # BEGIN LEAST SQUARES

       r_LS = - w_X;
       z_LS = matrix (0.0, rows = ncol(X), cols = 1);
       p_LS = - r_LS;
       norm_r2_LS = sum (r_LS ^ 2);
       i_LS = 0;
       while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS)
       {
         q_LS = t(X) %*% X %*% p_LS;
         q_LS = q_LS + lambda_LS * p_LS;
         alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
         z_LS = z_LS + alpha_LS * p_LS;
         old_norm_r2_LS = norm_r2_LS;
         r_LS = r_LS + alpha_LS * q_LS;
         norm_r2_LS = sum (r_LS ^ 2);
         p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
         i_LS = i_LS + 1;
       }

       # END LEAST SQUARES

       w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
     }