| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| |
| # |
# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC
| # EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y |
| # |
| # INPUT PARAMETERS: |
| # -------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # -------------------------------------------------------------------------------------------- |
| # X String --- Location (on HDFS) to read the matrix X of feature vectors |
| # Y String --- Location (on HDFS) to read the 1-column matrix Y of response values |
| # B String --- Location to store estimated regression parameters (the betas) |
| # S String --- Location to write the selected features ordered as computed by the algorithm |
| # O String " " Location to write the printed statistics; by default is standard output |
| # icpt Int 0 Intercept presence, shifting and rescaling the columns of X: |
| # 0 = no intercept, no shifting, no rescaling; |
| # 1 = add intercept, but neither shift nor rescale X; |
| # 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 |
# thr               Double  0.001   Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
#                                   no further features are being checked and the algorithm stops
| # fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" |
| # write_beta Boolean TRUE Should the beta's be returned? |
| # 0 = no |
| # 1 = yes |
| # -------------------------------------------------------------------------------------------- |
| # OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value: |
| # OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: |
| # icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B |
| # icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] |
| # icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] |
| # Col.2: betas for shifted/rescaled X and intercept |
| # |
| # In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated |
| # name-value pair per each line, as follows: |
| # |
| # NAME MEANING |
| # ------------------------------------------------------------------------------------- |
| # AVG_TOT_Y Average of the response value Y |
| # STDEV_TOT_Y Standard Deviation of the response value Y |
| # AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias |
| |
# Stepwise linear regression by greedy forward selection on AIC:
# starting from the empty (intercept-only) model, repeatedly add the single
# feature whose inclusion most decreases the AIC, until no candidate feature
# improves AIC by more than the relative threshold 'thr'.
#
# Inputs:
#   X       - n x m feature matrix
#   y       - n x 1 response vector
#   icpt    - intercept mode forwarded to lm(): 0 = none, 1 = intercept,
#             2 = intercept with shift & rescale of X columns
#   reg     - regularization parameter forwarded to lm()
#   tol     - convergence tolerance forwarded to lm()
#   maxi    - max iterations forwarded to lm() (0 = until convergence)
#   verbose - print progress messages
# Outputs:
#   B - regression coefficients mapped back to original column order
#       (plus an intercept row when icpt != 0)
#   S - 1-row matrix of the selected column indices, in selection order
#       (a single 0 when no feature was selected)
m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
    Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
  return(Matrix[Double] B, Matrix[Double] S)
{
  if( icpt!=0 & icpt!=1 & icpt!=2 )
    stop("Invalid steplm invocation with icpt="+icpt+" (valid values: 0,1,2).");

  # NOTE: currently only the forward selection strategy is supported:
  # start from one feature and iteratively add features until AIC improves
  # by less than this relative threshold (fraction of the current best AIC)
  thr = 0.001;

  if(verbose)
    print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
  X_orig = X;
  n = nrow(X_orig);
  m_orig = ncol(X_orig);

  # BEGIN STEPWISE LINEAR REGRESSION
  # columns_fixed[1,j] == 1 marks column j as already selected;
  # columns_fixed_ordered records the selected indices in selection order
  columns_fixed = matrix(0, 1, m_orig);
  columns_fixed_ordered = matrix(0, 1, 1);

  # X_global stores the best model found at each step
  X_global = matrix(0, n, 1);

  # Baseline AIC of the empty model: with an intercept the best constant
  # predictor is mean(y) (one parameter -> +2 penalty); without, it is 0
  if (icpt == 1 | icpt == 2) {
    beta = mean(y);
    AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n);
  } else {
    beta = 0.0;
    AIC_best_orig = n * log(sum(y ^ 2) / n);
  }
  if(verbose)
    print("Best AIC without any features: " + AIC_best_orig);
  # max possible number of coefficients per candidate model (+1 for intercept)
  boa_ncol = ncol(X_orig) + as.integer(icpt!=0);
  beta_out_all = matrix(0, boa_ncol, m_orig);

  # First pass to examine single features (each candidate fit is independent,
  # hence parfor)
  AICs = matrix(0, 1, m_orig);
  parfor (i in 1:m_orig) {
    [AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, maxi, verbose);
    AICs[1, i] = AIC_1;
    beta_out_all[1:nrow(beta_out_i), i] = beta_out_i;
  }
  AIC_best = min(min(AICs), AIC_best_orig);
  AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
  # column_best == 0 is the sentinel for "no feature improves AIC enough"
  column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), 0);

  # beta best so far
  # NOTE(review): when column_best is 0 this right-indexes column 0, which
  # looks invalid; it may only be tolerated because beta_best is never used
  # for the final output (see recomputation via linear_regression below) —
  # confirm against the DML runtime's indexing semantics
  beta_best = beta_out_all[, column_best];
  if (column_best == 0) {
    # No single feature beats the empty model: return all-zero betas
    # (plus the constant intercept when requested) and S = [0]
    if(verbose)
      print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
    B = matrix(0, m_orig, 1);
    if (icpt != 0)
      B = rbind(B, as.matrix(beta));
    S = matrix(0, 1, 1);
  }
  else {
    if(verbose)
      print("Best AIC " + AIC_best + " achieved with feature: " + column_best);

    columns_fixed[1, column_best] = 1;
    columns_fixed_ordered[1, 1] = column_best;
    X_global = X_orig[, column_best];

    # Greedy loop: on each pass, try appending every not-yet-selected column
    # to the current model and keep the best AIC improvement
    continue = TRUE
    while (continue) {
      # Subsequent passes over the features
      beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1);
      AICs_2 = matrix(0, 1, m_orig); # full overwrite
      parfor (i in 1:m_orig) {
        if (as.scalar(columns_fixed[1, i]) == 0) {
          # Construct the feature matrix
          Xi = cbind(X_global, X_orig[, i]);
          [AIC_2, beta_out_i2] = linear_regression(Xi, y, icpt, reg, tol, maxi, verbose);
          AICs_2[1, i] = AIC_2;
          beta_out_all_2[1:nrow(beta_out_i2), i] = beta_out_i2;
        }
        else {
          # already-selected columns must never win the argmin
          AICs_2[1,i] = Inf;
        }
      }

      # Determine the best AIC
      AIC_best_orig = AIC_best;
      AIC_best = min(min(AICs_2), AIC_best_orig);
      AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
      # if no candidate passes the threshold, column_best keeps its previous
      # (already-selected) value, which terminates the loop below
      column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs_2)), column_best);

      # have the best beta store in the matrix
      beta_best = beta_out_all_2[, column_best];

      # Append best found features (i.e., columns) to X_global
      if (as.scalar(columns_fixed[1, column_best]) == 0) {
        # new best feature found
        if(verbose)
          print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
        columns_fixed[1, column_best] = 1;
        columns_fixed_ordered = cbind(columns_fixed_ordered, as.matrix(column_best));
        if (ncol(columns_fixed_ordered) == m_orig) {
          # all features examined
          X_global = cbind(X_global, X_orig[, column_best]);
          continue = FALSE;
        } else {
          X_global = cbind(X_global, X_orig[, column_best]);
        }
      } else {
        # best column was already selected -> no admissible improvement left
        continue = FALSE;
      }
    }
    # run linear regression with selected set of features
    if( verbose )
      print("Running linear regression with selected features...");
    [AIC, beta_out] = linear_regression(X_global, y, icpt, reg, tol, maxi, verbose);
    S = columns_fixed_ordered;
    if (icpt != 0)
      # the intercept coefficient occupies slot boa_ncol in the reordered output
      S = cbind(S, matrix(boa_ncol, 1, 1))
    B = reorder_matrix(boa_ncol, beta_out, S);
  }
}
| |
# Fits a linear model with the lm builtin and scores the fit with the
# Akaike Information Criterion: AIC = 2*k + n*ln(RSS/n), where k is the
# number of fitted coefficients (including the intercept column, if any).
linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt,
  Double reg, Double tol, Integer maxi, Boolean verbose)
  return(Double AIC, Matrix[Double] beta)
{
  # Delegate the actual fit to the lm builtin (direct or iterative solver)
  beta = lm(X = X, y = y, icpt = icpt, reg=reg, tol=tol, maxi=maxi, verbose=FALSE);

  # Append a ones column so the intercept coefficient participates in scoring
  if( icpt != 0 )
    X = cbind(X, matrix(1,nrow(X),1))

  # Residual sum of squares of the fitted model, folded into the AIC formula
  n = nrow(X);
  rss = sum((y - X %*% beta) ^ 2);
  AIC = (2 * ncol(X)) + n * log(rss / n);
}
| |
# Scatters the betas of the selected features back to their original column
# positions; positions of unselected features receive zero.
# Assumes B and S carry the same number of entries; if the intercept is part
# of the model, the caller must have adjusted all inputs beforehand.
reorder_matrix = function(
  double ncolX, # number of columns in X, including the intercept column
  matrix[double] B, # fitted betas, one per selected feature
  matrix[double] S # selected column indices, same order as B
) return(matrix[double] Y) {
  sel = t(S);
  n_missing = ncolX - nrow(B);
  if (n_missing < 0) {
    stop("Error: unable to re-order the matrix. Reason: B more than matrix X");
  }
  if (n_missing > 0) {
    # pad both vectors up to length ncolX with zero entries
    zeros = matrix(0, n_missing, 1);
    B = rbind(B, zeros);
    sel = rbind(sel, zeros);
  }
  # table() cannot take 0 as an index, so map the padded (unused) slots to the
  # throw-away index ncolX+1, which the ncolX x ncolX output dims discard
  sel_shifted = replace(target = sel, pattern = 0, replacement = ncolX + 1);
  P = table(seq(1, nrow(sel_shifted)), sel_shifted, ncolX, ncolX);
  Y = t(P) %*% B;
}
| |
# TRUE iff the candidate AIC improves on the current best AND the improvement
# exceeds the relative threshold |thr * AIC_best| (stopping criterion).
checkAIC = function(Double AIC_cur, Double AIC_best, Double thr) return (Boolean R) {
  improved = AIC_cur < AIC_best;
  significant = (AIC_best - AIC_cur) > abs(thr * AIC_best);
  R = improved & significant;
}