| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC |
| # EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y |
| # |
| # INPUT PARAMETERS: |
| # -------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # -------------------------------------------------------------------------------------------- |
| # X String --- Location (on HDFS) to read the matrix X of feature vectors |
| # Y String --- Location (on HDFS) to read the 1-column matrix Y of response values |
| # B String --- Location to store estimated regression parameters (the betas) |
| # S String --- Location to write the selected features ordered as computed by the algorithm |
| # O String " " Location to write the printed statistics; by default is standard output |
| # icpt Int 1 Intercept presence, shifting and rescaling the columns of X: |
| # 0 = no intercept, no shifting, no rescaling; |
| # 1 = add intercept, but neither shift nor rescale X; |
| # 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 |
| # thr Double 0.001 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr |
| # no further features are being checked and the algorithm stops |
| # fmt String "text" Matrix output format for B (the betas) and S (the selected features), usually "text" or "csv" |
| # write_beta Boolean TRUE Should the betas be written to B? |
| # 0 = no |
| # 1 = yes |
| # -------------------------------------------------------------------------------------------- |
| # OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value: |
| # OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: |
| # icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B |
| # icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] |
| # icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] |
| # Col.2: betas for shifted/rescaled X and intercept |
| # |
| # In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated |
| # name-value pair per each line, as follows: |
| # |
| # NAME MEANING |
| # ------------------------------------------------------------------------------------- |
| # AVG_TOT_Y Average of the response value Y |
| # STDEV_TOT_Y Standard Deviation of the response value Y |
| # AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias |
| # STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) |
| # DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. |
| # R2 R^2 of residual with bias included vs. total average |
| # ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average |
| # R2_NOBIAS R^2 of residual with bias subtracted vs. total average |
| # ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average |
| # R2_VS_0 * R^2 of residual with bias included vs. zero constant |
| # ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant |
| # ------------------------------------------------------------------------------------- |
| # * The last two statistics are only printed if there is no intercept (icpt=0) |
| # If the best AIC is achieved without any features the matrix of selected features contains 0. |
| # Moreover, in this case no further statistics will be produced |
| # |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas |
| # O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv write_beta=TRUE |
| |
| # ---- Command-line arguments ---- |
| # X, Y, B and S are read without a default; the remaining arguments fall |
| # back to the defaults given in the ifdef() calls below. |
| fileX = $X; |
| fileY = $Y; |
| fileB = $B; |
| fileS = $S; |
| write_beta = ifdef ($write_beta, TRUE); |
| fmt = ifdef ($fmt, "text"); |
| intercept = ifdef ($icpt, 1); |
| # NOTE(review): thr is read here but is not forwarded to steplm() below, so it |
| # currently has no effect in this script - TODO confirm whether steplm exposes |
| # a stopping threshold that this value should be passed to. |
| thr = ifdef ($thr, 0.001); |
| |
| # ---- Load the feature matrix and the response column ---- |
| X_orig = read (fileX); |
| y = read (fileY); |
| |
| # ---- Stepwise model selection ---- |
| # steplm returns the regression parameters and the selected features. |
| [beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE); |
| |
| # ---- Persist the results ---- |
| # The selected features are always written. |
| write(Selected, fileS, format=fmt); |
| |
| # The betas are written only when requested via write_beta (default TRUE). |
| # as.logical lets both TRUE/FALSE and the documented 0/1 values act as the switch. |
| if (as.logical(write_beta)) { |
| write(beta_out, fileB, format=fmt); |
| } |