scripts/algorithms/StepLinearRegDS.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC
 # EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME        TYPE    DEFAULT    MEANING
 # --------------------------------------------------------------------------------------------
 # X           String     ---   Location (on HDFS) to read the matrix X of feature vectors
 # Y           String     ---   Location (on HDFS) to read the 1-column matrix Y of response values
 # B           String     ---   Location to store estimated regression parameters (the betas)
 # S           String     ---   Location to write the selected features ordered as computed by the algorithm
 # O           String     " "   Location to write the printed statistics; by default is standard output
 # icpt        Int        1     Intercept presence, shifting and rescaling the columns of X:
 #                              0 = no intercept, no shifting, no rescaling;
 #                              1 = add intercept, but neither shift nor rescale X;
 #                              2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
 # thr         Double    0.001  Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr,
 #                              no further features are checked and the algorithm stops
 #                              (NOTE: this script reads thr but does not pass it to the steplm call)
 # fmt         String   "text"  Matrix output format for B (the betas) and S (the selected features), usually "text" or "csv"
 # write_beta  Boolean   TRUE   Should the beta's be returned?
 #                              0 = no
 #                              1 = yes
 # --------------------------------------------------------------------------------------------
 # OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
 #         OUTPUT SIZE:   OUTPUT CONTENTS:                HOW TO PREDICT Y FROM X AND B:
 # icpt=0: ncol(X)   x 1  Betas for X only                Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
 # icpt=1: ncol(X)+1 x 1  Betas for X and intercept       Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
 # icpt=2: ncol(X)+1 x 2  Col.1: betas for X & intercept  Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
 #                        Col.2: betas for shifted/rescaled X and intercept
 #
 # In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
 # name-value pair per each line, as follows:
 #
 # NAME                  MEANING
 # -------------------------------------------------------------------------------------
 # AVG_TOT_Y             Average of the response value Y
 # STDEV_TOT_Y           Standard Deviation of the response value Y
 # AVG_RES_Y             Average of the residual Y - pred(Y|X), i.e. residual bias
 # STDEV_RES_Y           Standard Deviation of the residual Y - pred(Y|X)
 # DISPERSION            GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
 # R2                    R^2 of residual with bias included vs. total average
 # ADJUSTED_R2           Adjusted R^2 of residual with bias included vs. total average
 # R2_NOBIAS             R^2 of residual with bias subtracted vs. total average
 # ADJUSTED_R2_NOBIAS    Adjusted R^2 of residual with bias subtracted vs. total average
 # R2_VS_0               * R^2 of residual with bias included vs. zero constant
 # ADJUSTED_R2_VS_0      * Adjusted R^2 of residual with bias included vs. zero constant
 # -------------------------------------------------------------------------------------
 # * The last two statistics are only printed if there is no intercept (icpt=0)
 # If the best AIC is achieved without any features the matrix of selected features contains 0.
 # Moreover, in this case no further statistics will be produced
 #
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar SystemDS.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
 #     O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv write_beta=TRUE

 # ---- Command-line arguments (-nvargs) ----
 # X, Y, B and S have no fallback and must be supplied; the rest use ifdef defaults.
 fileX = $X;
 fileY = $Y;
 fileB = $B;
 fileS = $S;
 write_beta = ifdef($write_beta, TRUE);
 fmt  = ifdef ($fmt, "text");
 intercept = ifdef ($icpt, 1);
 # thr is still parsed so that existing invocations passing it keep working.
 # NOTE(review): it is not forwarded to steplm below -- confirm whether the
 # builtin accepts a threshold argument before wiring it through.
 thr = ifdef ($thr, 0.001);

 # Load the feature matrix and the 1-column response matrix.
 X_orig = read (fileX);
 y = read (fileY);

 # Stepwise model selection: yields the regression parameters (betas) and the
 # selected features in the order the algorithm picked them.
 [beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE);

 # The selected features are always written, using the same format as the betas.
 write(Selected, fileS, format=fmt);

 # Honor write_beta: the flag used to be read but ignored, so B was always written.
 # as.logical lets both TRUE/FALSE and the documented 0/1 values act as the switch.
 if (as.logical(write_beta)) {
   write(beta_out, fileB, format=fmt);
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGORITHM USING AIC
	# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X String --- Location (on HDFS) to read the matrix X of feature vectors
	# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
	# B String --- Location to store estimated regression parameters (the betas)
	# S String --- Location to write the selected features ordered as computed by the algorithm
	# O String " " Location to write the printed statistics; by default is standard output
	# icpt Int 1 Intercept presence, shifting and rescaling the columns of X:
	# 0 = no intercept, no shifting, no rescaling;
	# 1 = add intercept, but neither shift nor rescale X;
	# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
	# thr Double 0.001 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr,
	# no further features are checked and the algorithm stops
	# (NOTE: this script reads thr but does not pass it to the steplm call)
	# fmt String "text" Matrix output format for B (the betas) and S (the selected features), usually "text" or "csv"
	# write_beta Boolean TRUE Should the beta's be returned?
	# 0 = no
	# 1 = yes
	# --------------------------------------------------------------------------------------------
	# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
	# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
	# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
	# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
	# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
	# Col.2: betas for shifted/rescaled X and intercept
	#
	# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
	# name-value pair per each line, as follows:
	#
	# NAME MEANING
	# -------------------------------------------------------------------------------------
	# AVG_TOT_Y Average of the response value Y
	# STDEV_TOT_Y Standard Deviation of the response value Y
	# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
	# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
	# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
	# R2 R^2 of residual with bias included vs. total average
	# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
	# R2_NOBIAS R^2 of residual with bias subtracted vs. total average
	# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
	# R2_VS_0 * R^2 of residual with bias included vs. zero constant
	# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
	# -------------------------------------------------------------------------------------
	# * The last two statistics are only printed if there is no intercept (icpt=0)
	# If the best AIC is achieved without any features the matrix of selected features contains 0.
	# Moreover, in this case no further statistics will be produced
	#
	# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
	# hadoop jar SystemDS.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
	# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv write_beta=TRUE

	# ---- Command-line arguments (-nvargs) ----
	# X, Y, B and S have no fallback and must be supplied; the rest use ifdef defaults.
	fileX = $X;
	fileY = $Y;
	fileB = $B;
	fileS = $S;
	write_beta = ifdef($write_beta, TRUE);
	fmt = ifdef ($fmt, "text");
	intercept = ifdef ($icpt, 1);
	# thr is still parsed so that existing invocations passing it keep working.
	# NOTE(review): it is not forwarded to steplm below -- confirm whether the
	# builtin accepts a threshold argument before wiring it through.
	thr = ifdef ($thr, 0.001);

	# Load the feature matrix and the 1-column response matrix.
	X_orig = read (fileX);
	y = read (fileY);

	# Stepwise model selection: yields the regression parameters (betas) and the
	# selected features in the order the algorithm picked them.
	[beta_out, Selected] = steplm(X=X_orig, y=y, icpt=intercept, verbose=FALSE);

	# The selected features are always written, using the same format as the betas.
	write(Selected, fileS, format=fmt);

	# Honor write_beta: the flag used to be read but ignored, so B was always written.
	# as.logical lets both TRUE/FALSE and the documented 0/1 values act as the switch.
	if (as.logical(write_beta)) {
		write(beta_out, fileB, format=fmt);
	}