#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT SELECTS A LINEAR MODEL VIA A STEPWISE ALGORITHM USING AIC
# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
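# THE AIC USED THROUGHOUT IS  AIC = 2 * k + n * log (RSS / n),  WHERE k IS THE NUMBER
# OF ESTIMATED PARAMETERS AND RSS THE RESIDUAL SUM OF SQUARES OF THE FITTED MODEL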
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X String --- Location (on HDFS) to read the matrix X of feature vectors
# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
# B String --- Location to store estimated regression parameters (the betas)
# S String --- Location to write the selected features ordered as computed by the algorithm
# O String " " Location to write the printed statistics; by default they go to standard output
# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# thr Double 0.01 Threshold to stop the algorithm: if the decrease in AIC falls below thr * |best AIC so far|,
# no further features are added and the algorithm stops
# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
# --------------------------------------------------------------------------------------------
# OUTPUT: Matrix of regression parameters (the betas), whose size depends on the icpt input value:
# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# Col.2: betas for shifted/rescaled X and intercept
#
# In addition, the last run of linear regression prints some statistics in CSV format, one comma-separated
# name-value pair per line, as follows:
#
# NAME MEANING
# -------------------------------------------------------------------------------------
# AVG_TOT_Y Average of the response value Y
# STDEV_TOT_Y Standard Deviation of the response value Y
# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
# PLAIN_R2 Plain R^2 of residual with bias included vs. total average
# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average
# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant
# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
# -------------------------------------------------------------------------------------
# * The last two statistics are only printed if there is no intercept (icpt=0)
# If the best AIC is achieved without any features, the matrix of selected features contains a single 0;
# in this case, no further statistics are produced
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
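#
# Scoring new data with the learned betas (here for icpt=1) can be done along these
# lines; a minimal sketch, where X_new and the file paths are hypothetical:
#   B = read ("OUTPUT_DIR/betas");
#   X_new = read ("INPUT_DIR/X_new");
#   y_pred = X_new %*% B[1:ncol(X_new), 1] + as.scalar (B[ncol(X_new) + 1, 1]);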
fileX = $X;
fileY = $Y;
fileB = $B;
fileS = $S;
# currently only the forward selection strategy is supported: start from one feature and iteratively add
# features as long as the AIC improves
dir = "forward";
fmt = ifdef ($fmt, "text");
intercept_status = ifdef ($icpt, 0);
thr = ifdef ($thr, 0.01);
print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
print ("Reading X and Y...");
X_orig = read (fileX);
y = read (fileY);
n = nrow (X_orig);
m_orig = ncol (X_orig);
# BEGIN STEPWISE LINEAR REGRESSION
if (dir == "forward") {
  continue = TRUE;
  columns_fixed = matrix (0, rows = 1, cols = m_orig);
  columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
  # X_global stores the best model found at each step
  X_global = matrix (0, rows = n, cols = 1);
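  # AIC of the null model: AIC = 2 * df + n * log (RSS / n); with an intercept the null
  # model predicts mean(y) (df = 1), otherwise it predicts the zero constant (df = 0)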
  if (intercept_status == 1 | intercept_status == 2) {
    beta = mean (y);
    AIC_best = 2 + n * log (sum ((beta - y)^2) / n);
  } else {
    beta = 0;
    AIC_best = n * log (sum (y^2) / n);
  }
  AICs = matrix (AIC_best, rows = 1, cols = m_orig);
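  # AICs[1,i] holds the AIC of the most recent candidate model that adds feature i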
print ("Best AIC without any features: " + AIC_best);
# First pass to examine single features
parfor (i in 1:m_orig) {
[AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " ");
AICs[1,i] = AIC_1;
}
# Determine the best AIC
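  # A feature qualifies only if it lowers the AIC by more than thr * |AIC_best|,
  # i.e., the stopping threshold is relative to the current best AIC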
  column_best = 0;
  for (k in 1:m_orig) {
    AIC_cur = as.scalar (AICs[1,k]);
    if ((AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best))) {
      column_best = k;
      AIC_best = as.scalar (AICs[1,k]);
    }
  }
  if (column_best == 0) {
    print ("AIC of an empty model is " + AIC_best + " and no feature achieves more than a " + (thr * 100) + "% decrease in AIC!");
    S = matrix (0, rows = 1, cols = 1);
    if (intercept_status == 0) {
      B = matrix (beta, rows = m_orig, cols = 1);
    } else {
      B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
      B_tmp[m_orig + 1,] = beta;
      B = B_tmp;
    }
    write (S, fileS, format = fmt);
    write (B, fileB, format = fmt);
    stop ("");
  }
print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1,column_best] = 1;
columns_fixed_ordered[1,1] = column_best;
X_global = X_orig[,column_best];
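  # Greedy forward selection: each iteration tries appending every remaining feature to
  # X_global, keeps the one with the best AIC, and stops when no feature qualifies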
  while (continue) {
    # Subsequent passes over the features
    parfor (i in 1:m_orig) {
      if (as.scalar (columns_fixed[1,i]) == 0) {
        # Construct the feature matrix
        X = append (X_global, X_orig[,i]);
        [AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " ");
        AICs[1,i] = AIC_2;
      }
    }
    # Determine the best AIC
    for (k in 1:m_orig) {
      AIC_cur = as.scalar (AICs[1,k]);
      if ((AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar (columns_fixed[1,k]) == 0)) {
        column_best = k;
        AIC_best = as.scalar (AICs[1,k]);
      }
    }
    # Append the best found feature (i.e., column) to X_global
    if (as.scalar (columns_fixed[1,column_best]) == 0) { # new best feature found
      print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
      columns_fixed[1,column_best] = 1;
      columns_fixed_ordered = append (columns_fixed_ordered, as.matrix (column_best));
      X_global = append (X_global, X_orig[,column_best]);
      if (ncol (columns_fixed_ordered) == m_orig) { # all features examined
        continue = FALSE;
      }
    } else {
      continue = FALSE;
    }
  }
  # Run linear regression with the selected set of features
  print ("Running linear regression with selected features...");
  [AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB);
} else {
  stop ("Currently only forward selection strategy is supported!");
}
/*
* Computes linear regression using a direct solver for (X^T X) beta = X^T y.
* It also outputs the AIC of the computed model.
*/
linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
  intercept_status = ifdef ($icpt, 0);
  fmt = ifdef ($fmt, "text");
  n = nrow (X);
  m = ncol (X);
  # Introduce the intercept, shift and rescale the columns of X if needed
  if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
    ones_n = matrix (1, rows = n, cols = 1);
    X = append (X, ones_n);
    m = m - 1;
  }
  m_ext = ncol (X);
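  # For icpt=2, compute per-column factors that standardize each column of X to mean 0
  # and variance 1; zero-variance columns are guarded (is_unsafe) by substituting
  # variance 1, and the intercept column is neither scaled nor shifted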
  if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
    # Important assumption: X[, m_ext] = ones_n
    avg_X_cols = t(colSums (X)) / n;
    var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
    is_unsafe = (var_X_cols <= 0);
    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
    scale_X[m_ext, 1] = 1;
    shift_X = - avg_X_cols * scale_X;
    shift_X[m_ext, 1] = 0;
  } else {
    scale_X = matrix (1, rows = m_ext, cols = 1);
    shift_X = matrix (0, rows = m_ext, cols = 1);
  }
  # BEGIN THE DIRECT SOLVE ALGORITHM
  A = t(X) %*% X;
  b = t(X) %*% y;
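  # For icpt=2, rewrite the normal equations in the standardized space: with
  # T = diag (scale_X) + shift_X %*% t(e_last), where e_last is the last standard basis
  # vector, the two updates of A compute T %*% A %*% t(T) (exploiting the symmetry of A)
  # and the update of b computes T %*% b, i.e., t(X_std) %*% X_std and t(X_std) %*% y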
  if (intercept_status == 2) {
    A = t(diag (scale_X) %*% A + shift_X %*% A[m_ext, ]);
    A = diag (scale_X) %*% A + shift_X %*% A[m_ext, ];
    b = diag (scale_X) %*% b + shift_X %*% b[m_ext, ];
  }
  beta_unscaled = solve (A, b);
  # END THE DIRECT SOLVE ALGORITHM
  if (intercept_status == 2) {
    beta = scale_X * beta_unscaled;
    beta[m_ext, ] = beta[m_ext, ] + t(shift_X) %*% beta_unscaled;
  } else {
    beta = beta_unscaled;
  }
  # COMPUTE AIC
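  # AIC = 2 * k + n * log (RSS / n) with k = m_ext model parameters; this is the
  # Gaussian log-likelihood form of the AIC up to an additive constant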
  y_residual = y - X %*% beta;
  ss_res = sum (y_residual ^ 2);
  eq_deg_of_freedom = m_ext;
  AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
  if (fileB != " ") {
    fileO = ifdef ($O, " ");
    fileS = $S;
    print ("Computing the statistics...");
    avg_tot = sum (y) / n;
    ss_tot = sum (y ^ 2);
    ss_avg_tot = ss_tot - n * avg_tot ^ 2;
    var_tot = ss_avg_tot / (n - 1);
    # y_residual and ss_res were already computed above for the AIC
    avg_res = sum (y_residual) / n;
    ss_avg_res = ss_res - n * avg_res ^ 2;
    plain_R2 = 1 - ss_res / ss_avg_tot;
    if (n > m_ext) {
      dispersion = ss_res / (n - m_ext);
      adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
    } else {
      dispersion = 0.0 / 0.0;
      adjusted_R2 = 0.0 / 0.0;
    }
    plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
    deg_freedom = n - m - 1;
    if (deg_freedom > 0) {
      var_res = ss_avg_res / deg_freedom;
      adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
    } else {
      var_res = 0.0 / 0.0;
      adjusted_R2_nobias = 0.0 / 0.0;
      print ("Warning: zero or negative number of degrees of freedom.");
    }
    plain_R2_vs_0 = 1 - ss_res / ss_tot;
    if (n > m) {
      adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
    } else {
      adjusted_R2_vs_0 = 0.0 / 0.0;
    }
str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average
str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average
str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
if (intercept_status == 0) {
str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant
str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
}
if (fileO != " ") {
write (str, fileO);
} else {
print (str);
}
    # Prepare the output matrix
    print ("Writing the output matrix...");
    if (intercept_status == 2) {
      beta_out = append (beta, beta_unscaled);
    } else {
      beta_out = beta;
    }
    # Output which features give the best AIC and are being used for linear regression
    write (Selected, fileS, format = fmt);
    no_selected = ncol (Selected);
    max_selected = max (Selected);
    last = max_selected + 1;
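    # Map the betas, ordered by selection step, back to the original column positions:
    # table (seq (1, k), t(Selected)) builds a 0/1 matrix P1 with P1[i, Selected[i]] = 1;
    # scaling row i by beta[i] (the elementwise multiply broadcasts the column vector)
    # and taking colSums places each beta at its original column index; trailing zeros
    # pad the unselected features, and for icpt != 0 the intercept beta is moved to
    # position m_orig + 1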
    if (intercept_status != 0) {
      Selected_ext = append (Selected, as.matrix (last));
      P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext));
      if (intercept_status == 2) {
        P1_beta = P1 * beta;
        P2_beta = colSums (P1_beta);
        P1_beta_unscaled = P1 * beta_unscaled;
        P2_beta_unscaled = colSums (P1_beta_unscaled);
        if (max_selected < m_orig) {
          P2_beta = append (P2_beta, matrix (0, rows = 1, cols = (m_orig - max_selected)));
          P2_beta_unscaled = append (P2_beta_unscaled, matrix (0, rows = 1, cols = (m_orig - max_selected)));
          P2_beta[1, m_orig + 1] = P2_beta[1, max_selected + 1];
          P2_beta[1, max_selected + 1] = 0;
          P2_beta_unscaled[1, m_orig + 1] = P2_beta_unscaled[1, max_selected + 1];
          P2_beta_unscaled[1, max_selected + 1] = 0;
        }
        beta_out = append (t(P2_beta), t(P2_beta_unscaled));
      } else {
        P1_beta = P1 * beta;
        P2_beta = colSums (P1_beta);
        if (max_selected < m_orig) {
          P2_beta = append (P2_beta, matrix (0, rows = 1, cols = (m_orig - max_selected)));
          P2_beta[1, m_orig + 1] = P2_beta[1, max_selected + 1];
          P2_beta[1, max_selected + 1] = 0;
        }
        beta_out = t(P2_beta);
      }
    } else {
      P1 = table (seq (1, no_selected), t(Selected));
      P1_beta = P1 * beta;
      P2_beta = colSums (P1_beta);
      if (max_selected < m_orig) {
        P2_beta = append (P2_beta, matrix (0, rows = 1, cols = (m_orig - max_selected)));
      }
      beta_out = t(P2_beta);
    }
    write (beta_out, fileB, format = fmt);
  }
}