| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| |
| # |
# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC
| # EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y |
| # |
| # INPUT PARAMETERS: |
| # -------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # -------------------------------------------------------------------------------------------- |
| # X String --- Location (on HDFS) to read the matrix X of feature vectors |
| # Y String --- Location (on HDFS) to read the 1-column matrix Y of response values |
| # B String --- Location to store estimated regression parameters (the betas) |
| # S String --- Location to write the selected features ordered as computed by the algorithm |
| # O String " " Location to write the printed statistics; by default is standard output |
| # icpt Int 0 Intercept presence, shifting and rescaling the columns of X: |
| # 0 = no intercept, no shifting, no rescaling; |
| # 1 = add intercept, but neither shift nor rescale X; |
| # 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 |
# thr               Double  0.001   Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
#                                   no further features are being checked and the algorithm stops
| # fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" |
| # write_beta Boolean TRUE Should the beta's be returned? |
| # 0 = no |
| # 1 = yes |
| # -------------------------------------------------------------------------------------------- |
| # OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value: |
| # OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: |
| # icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B |
| # icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] |
| # icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] |
| # Col.2: betas for shifted/rescaled X and intercept |
| # |
| # In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated |
| # name-value pair per each line, as follows: |
| # |
| # NAME MEANING |
| # ------------------------------------------------------------------------------------- |
| # AVG_TOT_Y Average of the response value Y |
| # STDEV_TOT_Y Standard Deviation of the response value Y |
| # AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias |
| |
# Stepwise linear regression by greedy forward selection on AIC:
# starting from the empty (intercept-only) model, repeatedly add the single
# feature whose inclusion most decreases the AIC, until no candidate feature
# improves AIC by more than the relative threshold 'thr'.
#
# Inputs:
#   X       - n x m feature matrix
#   y       - n x 1 response vector
#   icpt    - intercept mode forwarded to lm(): 0 = none, 1 = intercept,
#             2 = intercept with shift & rescale of X columns
#   reg     - regularization parameter forwarded to lm()
#   tol     - convergence tolerance forwarded to lm()
#   maxi    - max iterations forwarded to lm() (0 = until convergence)
#   verbose - print progress messages
# Outputs:
#   B - regression coefficients mapped back to original column order
#       (plus an intercept row when icpt != 0)
#   S - 1-row matrix of the selected column indices, in selection order
#       (a single 0 when no feature was selected)
m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
    Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
  return(Matrix[Double] B, Matrix[Double] S)
{
  if( icpt!=0 & icpt!=1 & icpt!=2 )
    stop("Invalid steplm invocation with icpt="+icpt+" (valid values: 0,1,2).");

  # NOTE: currently only the forward selection strategy is supported:
  # start from one feature and iteratively add features until AIC improves
  # by less than this relative threshold (fraction of the current best AIC)
  thr = 0.001;

  if(verbose)
    print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
  X_orig = X;
  n = nrow(X_orig);
  m_orig = ncol(X_orig);

  # BEGIN STEPWISE LINEAR REGRESSION
  # columns_fixed[1,j] == 1 marks column j as already selected;
  # columns_fixed_ordered records the selected indices in selection order
  columns_fixed = matrix(0, 1, m_orig);
  columns_fixed_ordered = matrix(0, 1, 1);

  # X_global stores the best model found at each step
  X_global = matrix(0, n, 1);

  # Baseline AIC of the empty model: with an intercept the best constant
  # predictor is mean(y) (one parameter -> +2 penalty); without, it is 0
  if (icpt == 1 | icpt == 2) {
    beta = mean(y);
    AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n);
  } else {
    beta = 0.0;
    AIC_best_orig = n * log(sum(y ^ 2) / n);
  }
  if(verbose)
    print("Best AIC without any features: " + AIC_best_orig);
  # max possible number of coefficients per candidate model (+1 for intercept)
  boa_ncol = ncol(X_orig) + as.integer(icpt!=0);
  beta_out_all = matrix(0, boa_ncol, m_orig);

  # First pass to examine single features (each candidate fit is independent,
  # hence parfor)
  AICs = matrix(0, 1, m_orig);
  parfor (i in 1:m_orig) {
    [AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, maxi, verbose);
    AICs[1, i] = AIC_1;
    beta_out_all[1:nrow(beta_out_i), i] = beta_out_i;
  }
  AIC_best = min(min(AICs), AIC_best_orig);
  AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
  # column_best == 0 is the sentinel for "no feature improves AIC enough"
  column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), 0);

  # beta best so far
  # NOTE(review): when column_best is 0 this right-indexes column 0, which
  # looks invalid; it may only be tolerated because beta_best is never used
  # for the final output (see recomputation via linear_regression below) —
  # confirm against the DML runtime's indexing semantics
  beta_best = beta_out_all[, column_best];
  if (column_best == 0) {
    # No single feature beats the empty model: return all-zero betas
    # (plus the constant intercept when requested) and S = [0]
    if(verbose)
      print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
    B = matrix(0, m_orig, 1);
    if (icpt != 0)
      B = rbind(B, as.matrix(beta));
    S = matrix(0, 1, 1);
  }
  else {
    if(verbose)
      print("Best AIC " + AIC_best + " achieved with feature: " + column_best);

    columns_fixed[1, column_best] = 1;
    columns_fixed_ordered[1, 1] = column_best;
    X_global = X_orig[, column_best];

    # Greedy loop: on each pass, try appending every not-yet-selected column
    # to the current model and keep the best AIC improvement
    continue = TRUE
    while (continue) {
      # Subsequent passes over the features
      beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1);
      AICs_2 = matrix(0, 1, m_orig); # full overwrite
      parfor (i in 1:m_orig) {
        if (as.scalar(columns_fixed[1, i]) == 0) {
          # Construct the feature matrix
          Xi = cbind(X_global, X_orig[, i]);
          [AIC_2, beta_out_i2] = linear_regression(Xi, y, icpt, reg, tol, maxi, verbose);
          AICs_2[1, i] = AIC_2;
          beta_out_all_2[1:nrow(beta_out_i2), i] = beta_out_i2;
        }
        else {
          # already-selected columns must never win the argmin
          AICs_2[1,i] = Inf;
        }
      }

      # Determine the best AIC
      AIC_best_orig = AIC_best;
      AIC_best = min(min(AICs_2), AIC_best_orig);
      AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
      # if no candidate passes the threshold, column_best keeps its previous
      # (already-selected) value, which terminates the loop below
      column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs_2)), column_best);

      # have the best beta store in the matrix
      beta_best = beta_out_all_2[, column_best];

      # Append best found features (i.e., columns) to X_global
      if (as.scalar(columns_fixed[1, column_best]) == 0) {
        # new best feature found
        if(verbose)
          print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
        columns_fixed[1, column_best] = 1;
        columns_fixed_ordered = cbind(columns_fixed_ordered, as.matrix(column_best));
        if (ncol(columns_fixed_ordered) == m_orig) {
          # all features examined
          X_global = cbind(X_global, X_orig[, column_best]);
          continue = FALSE;
        } else {
          X_global = cbind(X_global, X_orig[, column_best]);
        }
      } else {
        # best column was already selected -> no admissible improvement left
        continue = FALSE;
      }
    }
    # run linear regression with selected set of features
    if( verbose )
      print("Running linear regression with selected features...");
    [AIC, beta_out] = linear_regression(X_global, y, icpt, reg, tol, maxi, verbose);
    S = columns_fixed_ordered;
    if (icpt != 0)
      # the intercept coefficient occupies slot boa_ncol in the reordered output
      S = cbind(S, matrix(boa_ncol, 1, 1))
    B = reorder_matrix(boa_ncol, beta_out, S);
  }
}
| |
# Fits a linear model with the lm builtin and scores the fit with the
# Akaike Information Criterion: AIC = 2*k + n*ln(RSS/n), where k is the
# number of fitted coefficients (including the intercept column, if any).
linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt,
  Double reg, Double tol, Integer maxi, Boolean verbose)
  return(Double AIC, Matrix[Double] beta)
{
  # Delegate the actual fit to the lm builtin (direct or iterative solver)
  beta = lm(X = X, y = y, icpt = icpt, reg=reg, tol=tol, maxi=maxi, verbose=FALSE);

  # Append a ones column so the intercept coefficient participates in scoring
  if( icpt != 0 )
    X = cbind(X, matrix(1,nrow(X),1))

  # Residual sum of squares of the fitted model, folded into the AIC formula
  n = nrow(X);
  rss = sum((y - X %*% beta) ^ 2);
  AIC = (2 * ncol(X)) + n * log(rss / n);
}
| |
# Scatters the betas of the selected features back to their original column
# positions; positions of unselected features receive zero.
# Assumes B and S carry the same number of entries; if the intercept is part
# of the model, the caller must have adjusted all inputs beforehand.
reorder_matrix = function(
  double ncolX, # number of columns in X, including the intercept column
  matrix[double] B, # fitted betas, one per selected feature
  matrix[double] S # selected column indices, same order as B
) return(matrix[double] Y) {
  sel = t(S);
  n_missing = ncolX - nrow(B);
  if (n_missing < 0) {
    stop("Error: unable to re-order the matrix. Reason: B more than matrix X");
  }
  if (n_missing > 0) {
    # pad both vectors up to length ncolX with zero entries
    zeros = matrix(0, n_missing, 1);
    B = rbind(B, zeros);
    sel = rbind(sel, zeros);
  }
  # table() cannot take 0 as an index, so map the padded (unused) slots to the
  # throw-away index ncolX+1, which the ncolX x ncolX output dims discard
  sel_shifted = replace(target = sel, pattern = 0, replacement = ncolX + 1);
  P = table(seq(1, nrow(sel_shifted)), sel_shifted, ncolX, ncolX);
  Y = t(P) %*% B;
}
| |
# TRUE iff the candidate AIC improves on the current best AND the improvement
# exceeds the relative threshold |thr * AIC_best| (stopping criterion).
checkAIC = function(Double AIC_cur, Double AIC_best, Double thr) return (Boolean R) {
  improved = AIC_cur < AIC_best;
  significant = (AIC_best - AIC_cur) > abs(thr * AIC_best);
  R = improved & significant;
}