blob: a8740f5884bfb773c65e7de5deb1ff85788650e5 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC
# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X String --- Location (on HDFS) to read the matrix X of feature vectors
# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
# B String --- Location to store estimated regression parameters (the betas)
# S String --- Location to write the selected features ordered as computed by the algorithm
# O String " " Location to write the printed statistics; by default is standard output
# icpt Int 1 Intercept presence, shifting and rescaling the columns of X:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# thr Double 0.001 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr,
# no further features are checked and the algorithm stops
# fmt String "text" Matrix output format for B (the betas) and S (the selected features), usually "text" or "csv"
# write_beta Boolean TRUE Should the betas be returned?
# FALSE (or 0) = no
# TRUE (or 1) = yes
# --------------------------------------------------------------------------------------------
# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# Col.2: betas for shifted/rescaled X and intercept
#
# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
# name-value pair per each line, as follows:
#
# NAME MEANING
# -------------------------------------------------------------------------------------
# AVG_TOT_Y Average of the response value Y
# STDEV_TOT_Y Standard Deviation of the response value Y
# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
# R2 R^2 of residual with bias included vs. total average
# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
# R2_NOBIAS R^2 of residual with bias subtracted vs. total average
# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
# R2_VS_0 * R^2 of residual with bias included vs. zero constant
# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
# -------------------------------------------------------------------------------------
# * The last two statistics are only printed if there is no intercept (icpt=0)
# If the best AIC is achieved without any features the matrix of selected features contains 0.
# Moreover, in this case no further statistics will be produced
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemDS.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv write_beta=TRUE
# ---- Command-line (-nvargs) arguments ------------------------------------
# X, Y, B and S are read without a fallback value.
fileX = $X;
fileY = $Y;
fileB = $B;
fileS = $S;

# Optional arguments; the second argument of ifdef() is the fallback value.
write_beta = ifdef ($write_beta, TRUE);
fmt = ifdef ($fmt, "text");
intercept = ifdef ($icpt, 1);
# NOTE(review): thr is parsed here but is not forwarded to steplm() below, so
# it currently has no effect in this script -- confirm whether steplm() should
# receive it.
thr = ifdef ($thr, 0.001);

# ---- Load the feature matrix and the response column ----------------------
X_orig = read (fileX);
y = read (fileY);

# ---- Stepwise model selection ---------------------------------------------
# steplm returns the estimated regression parameters and the selected features.
[beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE);

# ---- Persist the results --------------------------------------------------
# The selected features are always written.
write(Selected, fileS, format=fmt);

# Honor write_beta: the betas are written only when requested. as.logical()
# lets both the Boolean form (TRUE/FALSE) and the numeric form (1/0) work.
if (as.logical(write_beta)) {
  write(beta_out, fileB, format=fmt);
}