| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM |
| # |
| # INPUT PARAMETERS: |
| # -------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # -------------------------------------------------------------------------------------------- |
| # X String --- Location (on HDFS) to read the matrix X of feature vectors |
| # Y String --- Location (on HDFS) to read the 1-column matrix Y of response values |
| # B String --- Location to store estimated regression parameters (the betas) |
| # O String " " Location to write the printed statistics; by default is standard output |
| # Log String " " Location to write per-iteration variables for log/debugging purposes |
| # icpt Int 0 Intercept presence, shifting and rescaling the columns of X: |
| # 0 = no intercept, no shifting, no rescaling; |
| # 1 = add intercept, but neither shift nor rescale X; |
| # 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 |
| # reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero |
| # for highly dependend/sparse/numerous features |
| # tol Double 0.000001 Tolerance (epsilon); conjugate graduent procedure terminates early if |
| # L2 norm of the beta-residual is less than tolerance * its initial norm |
| # maxi Int 0 Maximum number of conjugate gradient iterations, 0 = no maximum |
| # fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" |
| # -------------------------------------------------------------------------------------------- |
| # OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value: |
| # OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: |
| # icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B |
| # icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] |
| # icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] |
| # Col.2: betas for shifted/rescaled X and intercept |
| # |
| # In addition, some regression statistics are provided in CSV format, one comma-separated |
| # name-value pair per each line, as follows: |
| # |
| # NAME MEANING |
| # ------------------------------------------------------------------------------------- |
| # AVG_TOT_Y Average of the response value Y |
| # STDEV_TOT_Y Standard Deviation of the response value Y |
| # AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias |
| # STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) |
| # DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. |
| # R2 R^2 of residual with bias included vs. total average |
| # ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average |
| # R2_NOBIAS R^2 of residual with bias subtracted vs. total average |
| # ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average |
| # R2_VS_0 * R^2 of residual with bias included vs. zero constant |
| # ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant |
| # ------------------------------------------------------------------------------------- |
| # * The last two statistics are only printed if there is no intercept (icpt=0) |
| # |
| # The Log file, when requested, contains the following per-iteration variables in CSV |
| # format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for |
| # initial values: |
| # |
| # NAME MEANING |
| # ------------------------------------------------------------------------------------- |
| # CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y |
| # where A = t(X) %*% X + diag (lambda), or a similar quantity |
| # CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial |
| # ------------------------------------------------------------------------------------- |
| # |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f LinearRegCG.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B |
| # O=OUTPUT_DIR/Out icpt=2 reg=1.0 tol=0.001 maxi=100 fmt=csv Log=OUTPUT_DIR/log |
| |
| fileX = $X; |
| fileY = $Y; |
| fileB = $B; |
| fileO = ifdef ($O, " "); |
| fileLog = ifdef ($Log, " "); |
| fmtB = ifdef ($fmt, "text"); |
| |
| intercept_status = ifdef ($icpt, 0); # $icpt=0; |
| tolerance = ifdef ($tol, 0.000001); # $tol=0.000001; |
| max_iteration = ifdef ($maxi, 0); # $maxi=0; |
| regularization = ifdef ($reg, 0.000001); # $reg=0.000001; |
| |
| print ("BEGIN LINEAR REGRESSION SCRIPT"); |
| print ("Reading X and Y..."); |
| X = read (fileX); |
| y = read (fileY); |
| |
| n = nrow (X); |
| m = ncol (X); |
| ones_n = matrix (1, rows = n, cols = 1); |
| zero_cell = matrix (0, rows = 1, cols = 1); |
| |
| # Introduce the intercept, shift and rescale the columns of X if needed |
| |
| m_ext = m; |
| if (intercept_status == 1 | intercept_status == 2) # add the intercept column |
| { |
| X = cbind (X, ones_n); |
| m_ext = ncol (X); |
| } |
| |
| scale_lambda = matrix (1, rows = m_ext, cols = 1); |
| if (intercept_status == 1 | intercept_status == 2) |
| { |
| scale_lambda [m_ext, 1] = 0; |
| } |
| |
| if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 |
| { # Important assumption: X [, m_ext] = ones_n |
| avg_X_cols = t(colSums(X)) / n; |
| var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); |
| is_unsafe = (var_X_cols <= 0); |
| scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); |
| scale_X [m_ext, 1] = 1; |
| shift_X = - avg_X_cols * scale_X; |
| shift_X [m_ext, 1] = 0; |
| } else { |
| scale_X = matrix (1, rows = m_ext, cols = 1); |
| shift_X = matrix (0, rows = m_ext, cols = 1); |
| } |
| |
| # Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)" |
| # instead of "X". However, in order to preserve the sparsity of X, |
| # we apply the transform associatively to some other part of the expression |
| # in which it occurs. To avoid materializing a large matrix, we rewrite it: |
| # |
| # ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: |
| # ssX_A = diag (scale_X) %*% A; |
| # ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A; |
| # |
| # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: |
| # tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; |
| |
| lambda = scale_lambda * regularization; |
| beta_unscaled = matrix (0, rows = m_ext, cols = 1); |
| |
| if (max_iteration == 0) { |
| max_iteration = m_ext; |
| } |
| i = 0; |
| |
| # BEGIN THE CONJUGATE GRADIENT ALGORITHM |
| print ("Running the CG algorithm..."); |
| |
| r = - t(X) %*% y; |
| |
| if (intercept_status == 2) { |
| r = scale_X * r + shift_X %*% r [m_ext, ]; |
| } |
| |
| p = - r; |
| norm_r2 = sum (r ^ 2); |
| norm_r2_initial = norm_r2; |
| norm_r2_target = norm_r2_initial * tolerance ^ 2; |
| print ("||r|| initial value = " + sqrt (norm_r2_initial) + ", target value = " + sqrt (norm_r2_target)); |
| log_str = "CG_RESIDUAL_NORM,0," + sqrt (norm_r2_initial); |
| log_str = append (log_str, "CG_RESIDUAL_RATIO,0,1.0"); |
| |
| while (i < max_iteration & norm_r2 > norm_r2_target) |
| { |
| if (intercept_status == 2) { |
| ssX_p = scale_X * p; |
| ssX_p [m_ext, ] = ssX_p [m_ext, ] + t(shift_X) %*% p; |
| } else { |
| ssX_p = p; |
| } |
| |
| q = t(X) %*% (X %*% ssX_p); |
| |
| if (intercept_status == 2) { |
| q = scale_X * q + shift_X %*% q [m_ext, ]; |
| } |
| |
| q += lambda * p; |
| a = norm_r2 / sum (p * q); |
| beta_unscaled += a * p; |
| r += a * q; |
| old_norm_r2 = norm_r2; |
| norm_r2 = sum (r ^ 2); |
| p = -r + (norm_r2 / old_norm_r2) * p; |
| i = i + 1; |
| print ("Iteration " + i + ": ||r|| / ||r init|| = " + sqrt (norm_r2 / norm_r2_initial)); |
| log_str = append (log_str, "CG_RESIDUAL_NORM," + i + "," + sqrt (norm_r2)); |
| log_str = append (log_str, "CG_RESIDUAL_RATIO," + i + "," + sqrt (norm_r2 / norm_r2_initial)); |
| } |
| |
| if (i >= max_iteration) { |
| print ("Warning: the maximum number of iterations has been reached."); |
| } |
| print ("The CG algorithm is done."); |
| # END THE CONJUGATE GRADIENT ALGORITHM |
| |
| if (intercept_status == 2) { |
| beta = scale_X * beta_unscaled; |
| beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; |
| } else { |
| beta = beta_unscaled; |
| } |
| |
| print ("Computing the statistics..."); |
| |
| avg_tot = sum (y) / n; |
| ss_tot = sum (y ^ 2); |
| ss_avg_tot = ss_tot - n * avg_tot ^ 2; |
| var_tot = ss_avg_tot / (n - 1); |
| y_residual = y - X %*% beta; |
| avg_res = sum (y_residual) / n; |
| ss_res = sum (y_residual ^ 2); |
| ss_avg_res = ss_res - n * avg_res ^ 2; |
| |
| R2 = 1 - ss_res / ss_avg_tot; |
| dispersion = ifelse(n > m_ext, ss_res / (n - m_ext), NaN); |
| adjusted_R2 = ifelse(n > m_ext, 1 - dispersion / (ss_avg_tot / (n - 1)), NaN); |
| |
| R2_nobias = 1 - ss_avg_res / ss_avg_tot; |
| deg_freedom = n - m - 1; |
| if (deg_freedom > 0) { |
| var_res = ss_avg_res / deg_freedom; |
| adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); |
| } else { |
| var_res = NaN; |
| adjusted_R2_nobias = NaN; |
| print ("Warning: zero or negative number of degrees of freedom."); |
| } |
| |
| R2_vs_0 = 1 - ss_res / ss_tot; |
| adjusted_R2_vs_0 = ifelse(n > m, 1 - (ss_res / (n - m)) / (ss_tot / n), NaN); |
| |
| str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y |
| str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y |
| str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias |
| str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) |
| str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. |
| str = append (str, "R2," + R2); # R^2 of residual with bias included vs. total average |
| str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average |
| str = append (str, "R2_NOBIAS," + R2_nobias); # R^2 of residual with bias subtracted vs. total average |
| str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average |
| if (intercept_status == 0) { |
| str = append (str, "R2_VS_0," + R2_vs_0); # R^2 of residual with bias included vs. zero constant |
| str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant |
| } |
| |
| if (fileO != " ") { |
| write (str, fileO); |
| } else { |
| print (str); |
| } |
| |
| # Prepare the output matrix |
| print ("Writing the output matrix..."); |
| |
| if (intercept_status == 2) { |
| beta_out = cbind (beta, beta_unscaled); |
| } else { |
| beta_out = beta; |
| } |
| write (beta_out, fileB, format=fmtB); |
| |
| if (fileLog != " ") { |
| write (log_str, fileLog); |
| } |
| print ("END LINEAR REGRESSION SCRIPT"); |