#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC
# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
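#
# For reference, the AIC minimized by this script is computed as
#
#     AIC = 2 * k + n * ln(RSS / n)
#
# where k is the number of estimated coefficients (including the intercept,
# if present), n is the number of rows of X, and RSS is the residual sum of
# squares of the fitted model.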
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X String --- Location (on HDFS) to read the matrix X of feature vectors
# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
# B String --- Location to store estimated regression parameters (the betas)
# S String --- Location to write the selected features ordered as computed by the algorithm
# O String " " Location to write the printed statistics; by default, the standard output
# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr,
# no further features are checked and the algorithm stops
# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
# write_beta Boolean TRUE Should the betas be written out?
# TRUE = yes
# FALSE = no
# --------------------------------------------------------------------------------------------
# OUTPUT: Matrix of regression parameters (the betas), whose size depends on the input value of icpt:
# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# Col.2: betas for shifted/rescaled X and intercept
#
# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
# name-value pair per line, as follows:
#
# NAME MEANING
# -------------------------------------------------------------------------------------
# AVG_TOT_Y Average of the response value Y
# STDEV_TOT_Y Standard Deviation of the response value Y
# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
# R2 R^2 of residual with bias included vs. total average
# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
# R2_NOBIAS R^2 of residual with bias subtracted vs. total average
# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
# R2_VS_0 * R^2 of residual with bias included vs. zero constant
# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
# -------------------------------------------------------------------------------------
# * The last two statistics are only printed if there is no intercept (icpt=0)
# If the best AIC is achieved without any features, the matrix of selected features contains a single 0.
# Moreover, in this case no further statistics are produced
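#
# For reference, with RSS = sum((Y - pred(Y|X))^2), TSS = sum((Y - mean(Y))^2),
# and TSS_0 = sum(Y^2), the main statistics are computed as (see the
# linear_regression function below):
#     R2          = 1 - RSS / TSS
#     ADJUSTED_R2 = 1 - (RSS / (n - k)) / (TSS / (n - 1)), k = #coefficients
#     R2_VS_0     = 1 - RSS / TSS_0
# The NOBIAS variants use the bias-corrected residual sum of squares instead.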
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv write_beta=TRUE
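#
# A Spark invocation follows the same pattern (the exact spark-submit flags and
# paths depend on your cluster setup), e.g.:
# spark-submit SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv write_beta=TRUE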
fileX = $X;
fileY = $Y;
fileB = $B;
fileS = $S;
write_beta = ifdef($write_beta, TRUE);
# currently only the forward selection strategy is supported: start from the empty model and iteratively add
# features while the AIC improves by more than the threshold
dir = "forward";
fmt = ifdef ($fmt, "text");
intercept_status = ifdef ($icpt, 0);
thr = ifdef ($thr, 0.01);
print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
print ("Reading X and Y...");
X_orig = read (fileX);
y = read (fileY);
n = nrow (X_orig);
m_orig = ncol (X_orig);
# BEGIN STEPWISE LINEAR REGRESSION
if (dir == "forward") {
continue = TRUE;
columns_fixed = matrix (0, rows = 1, cols = m_orig);
columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
# X_global stores the best model found at each step
X_global = matrix (0, rows = n, cols = 1);
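# Baseline AIC of the empty model: with an intercept it predicts mean(y) using
# a single parameter (k = 1), so AIC = 2 + n * log(RSS / n); without an
# intercept it predicts 0 with no parameters (k = 0).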
if (intercept_status == 1 | intercept_status == 2) {
beta = mean (y);
AIC_best = 2 + n * log(sum((beta - y)^2) / n);
} else {
beta = 0.0;
AIC_best = n * log(sum(y^2) / n);
}
AICs = matrix (AIC_best, rows = 1, cols = m_orig);
print ("Best AIC without any features: " + AIC_best);
boa_ncol = ncol(X_orig);
if (intercept_status != 0) {
boa_ncol = boa_ncol + 1;
}
beta_out_all = matrix(0, rows = boa_ncol, cols = m_orig * 1);
y_ncol = 1;
# First pass to examine single features
parfor (i in 1:m_orig, check = 0) {
columns_fixed_ordered_1 = matrix(i, rows=1, cols=1);
[AIC_1, beta_out_i] = linear_regression (X_orig[, i], y, m_orig, columns_fixed_ordered_1,
write_beta, 0);
AICs[1, i] = AIC_1;
beta_out_all[1:nrow(beta_out_i), (i - 1) * y_ncol + 1 : i * y_ncol] = beta_out_i[, 1:1];
}
# Determine the best AIC
column_best = 0;
for (k in 1:m_orig) {
AIC_cur = as.scalar (AICs[1, k]);
if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
column_best = k;
AIC_best = as.scalar(AICs[1, k]);
}
}
# best beta found so far
beta_best = beta_out_all[, (column_best-1) * y_ncol + 1: column_best * y_ncol];
if (column_best == 0) {
print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " +
(thr * 100) + "% decrease in AIC!");
Selected = matrix (0, rows = 1, cols = 1);
if (intercept_status == 0) {
B = matrix (beta, rows = m_orig, cols = 1);
} else {
B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
B_tmp[m_orig + 1, ] = beta;
B = B_tmp;
}
beta_out = B;
write(Selected, fileS, format=fmt);
write(beta_out, fileB, format=fmt);
stop ("");
}
print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1, column_best] = 1;
columns_fixed_ordered[1, 1] = column_best;
X_global = X_orig[, column_best];
while (continue) {
# Subsequent passes over the features
beta_out_all_2 = matrix(0, rows = boa_ncol, cols = m_orig * 1);
parfor (i in 1:m_orig, check = 0) {
if (as.scalar(columns_fixed[1, i]) == 0) {
# Construct the feature matrix
X = cbind (X_global, X_orig[, i]);
columns_fixed_ordered_2 = cbind (columns_fixed_ordered, matrix(i, rows=1, cols=1));
[AIC_2, beta_out_i] = linear_regression (X, y, m_orig, columns_fixed_ordered_2, write_beta, 0);
beta_out_all_2[1:nrow(beta_out_i), (i - 1) * y_ncol + 1 : i * y_ncol] = beta_out_i[,1:1];
AICs[1, i] = AIC_2;
}
}
# Determine the best AIC
for (k in 1:m_orig) {
AIC_cur = as.scalar (AICs[1, k]);
if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) &
(as.scalar(columns_fixed[1, k]) == 0) ) {
column_best = k;
AIC_best = as.scalar(AICs[1, k]);
}
}
# store the best beta found so far
beta_best = beta_out_all_2[, (column_best - 1) * y_ncol + 1 : column_best * y_ncol];
# Append best found features (i.e., columns) to X_global
if (as.scalar(columns_fixed[1, column_best]) == 0) { # new best feature found
print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1, column_best] = 1;
columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best));
X_global = cbind (X_global, X_orig[, column_best]);
if (ncol(columns_fixed_ordered) == m_orig) { # all features examined
continue = FALSE;
}
} else {
continue = FALSE;
}
}
# run linear regression with selected set of features
print ("Running linear regression with selected features...");
[AIC, beta_out] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, write_beta, 1);
Selected = columns_fixed_ordered;
if (intercept_status != 0) {
Selected = cbind(Selected, matrix(boa_ncol, rows=1, cols=1));
}
beta_out = reorder_matrix(boa_ncol, beta_out, Selected);
write(Selected, fileS, format=fmt);
write(beta_out, fileB, format=fmt);
} else {
stop ("Currently only forward selection strategy is supported!");
}
# Computes linear regression using a direct solver for (X^T X) beta = X^T y.
# It also outputs the AIC of the computed model.
linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig,
Matrix[Double] Selected, Boolean write_beta, Boolean writeStats)
return (Double AIC, Matrix[Double] beta) {
intercept_status = ifdef ($icpt, 0);
fmt = ifdef ($fmt, "text");
n = nrow (X);
m = ncol (X);
# Introduce the intercept, shift and rescale the columns of X if needed
if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
ones_n = matrix (1, rows = n, cols = 1);
X = cbind (X, ones_n);
m = m - 1;
}
m_ext = ncol(X);
if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
# Important assumption: X [, m_ext] = ones_n
avg_X_cols = t(colSums(X)) / n;
var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
is_unsafe = (var_X_cols <= 0);
scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
scale_X [m_ext, 1] = 1;
shift_X = - avg_X_cols * scale_X;
shift_X [m_ext, 1] = 0;
} else {
scale_X = matrix (1, rows = m_ext, cols = 1);
shift_X = matrix (0, rows = m_ext, cols = 1);
}
# BEGIN THE DIRECT SOLVE ALGORITHM
A = t(X) %*% X;
b = t(X) %*% y;
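# When icpt=2, the normal equations are rewritten for the shifted/rescaled X
# without ever materializing it, which preserves the sparsity of X: viewing
# the shift/scale as a transform T with scaled_X = X %*% T (valid because
# X[, m_ext] = ones_n), the code below computes A := t(T) %*% A %*% T and
# b := t(T) %*% b, then maps the solution back via beta = T %*% beta_unscaled.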
if (intercept_status == 2) {
A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
}
beta_unscaled = solve (A, b);
# END THE DIRECT SOLVE ALGORITHM
if (intercept_status == 2) {
beta = scale_X * beta_unscaled;
beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
} else {
beta = beta_unscaled;
}
# COMPUTE AIC
y_residual = y - X %*% beta;
ss_res = sum (y_residual ^ 2);
eq_deg_of_freedom = m_ext;
AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
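# Note: the constant terms of the Gaussian log-likelihood (and the parameter
# count for the variance) are omitted here; they are identical for every
# candidate model with the same n and do not affect the stepwise selection.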
if(write_beta == 1) {
fileO = ifdef ($O, " ");
fileS = $S;
print ("Computing the statistics...");
avg_tot = sum (y) / n;
ss_tot = sum (y ^ 2);
ss_avg_tot = ss_tot - n * avg_tot ^ 2;
var_tot = ss_avg_tot / (n - 1);
# y_residual = y - X %*% beta;
avg_res = sum (y_residual) / n;
# ss_res = sum (y_residual ^ 2);
ss_avg_res = ss_res - n * avg_res ^ 2;
R2 = 1 - ss_res / ss_avg_tot;
if (n > m_ext) {
dispersion = ss_res / (n - m_ext);
adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
} else {
dispersion = 0.0 / 0.0;
adjusted_R2 = 0.0 / 0.0;
}
R2_nobias = 1 - ss_avg_res / ss_avg_tot;
deg_freedom = n - m - 1;
if (deg_freedom > 0) {
var_res = ss_avg_res / deg_freedom;
adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
} else {
var_res = 0.0 / 0.0;
adjusted_R2_nobias = 0.0 / 0.0;
print ("Warning: zero or negative number of degrees of freedom.");
}
R2_vs_0 = 1 - ss_res / ss_tot;
if (n > m) {
adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
} else {
adjusted_R2_vs_0 = 0.0 / 0.0;
}
str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
str = append (str, "R2," + R2); # R^2 of residual with bias included vs. total average
str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
str = append (str, "R2_NOBIAS," + R2_nobias); # R^2 of residual with bias subtracted vs. total average
str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
if (intercept_status == 0) {
str = append (str, "R2_VS_0," + R2_vs_0); # R^2 of residual with bias included vs. zero constant
str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
}
if (fileO != " " & writeStats != 0) {
write(str, fileO);
} else {
print (str);
print ("");
}
# TODO IMPORTANT NOTE: with the fix in PR-22, the intercept=2 case is not yet
# accounted for; the code before the fix did not match, so it has been removed
# for now. See the git revision history and diff for the changes. This feature
# is disabled for now and will be added back in the future.
}
}
reorder_matrix = function(
Double ncolX, # number of columns in X, including the intercept column
Matrix[Double] B, # beta
Matrix[Double] S # Selected
) return (Matrix[Double] Y) {
# This function assumes that B and S have the same number of elements.
# If the intercept is included in the model, all inputs should be adjusted
# appropriately before calling this function.
S = t(S);
num_empty_B = ncolX - nrow(B);
if (num_empty_B < 0) {
stop("Error: unable to re-order the matrix. Reason: B more than matrix X");
}
if (num_empty_B > 0) {
pad_zeros = matrix(0, rows = num_empty_B, cols=1);
B = rbind(B, pad_zeros);
S = rbind(S, pad_zeros);
}
# since table() does not accept zeros as indices, we map them outside the valid range.
S0 = replace(target = S, pattern = 0, replacement = ncolX+1);
seqS = seq(1, nrow(S0));
P = table(seqS, S0, ncolX, ncolX);
Y = t(P) %*% B;
}
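# Illustrative example for reorder_matrix (hypothetical values): with ncolX = 3,
# B = [b2; b3] (betas in selection order) and S = [2, 3] (columns selected in
# that order), the function pads to B = [b2; b3; 0] and S0 = [2; 3; 4], builds
# the permutation P = table([1; 2; 3], S0, 3, 3) with P[1, 2] = P[2, 3] = 1
# (the out-of-range index 4 is dropped), and returns Y = t(P) %*% B = [0; b2; b3],
# i.e., each beta placed at its original column position in X.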