scripts/algorithms/LinearRegDS.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT SOLVES LINEAR REGRESSION USING A DIRECT SOLVER FOR (X^T X + lambda) beta = X^T y
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME  TYPE   DEFAULT  MEANING
 # --------------------------------------------------------------------------------------------
 # X     String  ---     Location (on HDFS) to read the matrix X of feature vectors
 # Y     String  ---     Location (on HDFS) to read the 1-column matrix Y of response values
 # B     String  ---     Location to store estimated regression parameters (the betas)
 # O     String  " "     Location to write the printed statistics; by default is standard output
 # icpt  Int      0      Intercept presence, shifting and rescaling the columns of X:
 #                       0 = no intercept, no shifting, no rescaling;
 #                       1 = add intercept, but neither shift nor rescale X;
 #                       2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
 # reg   Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero
 #                       for highly dependend/sparse/numerous features
 # fmt   String "text"   Matrix output format for B (the betas) only, usually "text" or "csv"
 # --------------------------------------------------------------------------------------------
 # OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
 #         OUTPUT SIZE:   OUTPUT CONTENTS:                HOW TO PREDICT Y FROM X AND B:
 # icpt=0: ncol(X)   x 1  Betas for X only                Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
 # icpt=1: ncol(X)+1 x 1  Betas for X and intercept       Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
 # icpt=2: ncol(X)+1 x 2  Col.1: betas for X & intercept  Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
 #                        Col.2: betas for shifted/rescaled X and intercept
 #
 # In addition, some regression statistics are provided in CSV format, one comma-separated
 # name-value pair per each line, as follows:
 #
 # NAME                  MEANING
 # -------------------------------------------------------------------------------------
 # AVG_TOT_Y             Average of the response value Y
 # STDEV_TOT_Y           Standard Deviation of the response value Y
 # AVG_RES_Y             Average of the residual Y - pred(Y|X), i.e. residual bias
 # STDEV_RES_Y           Standard Deviation of the residual Y - pred(Y|X)
 # DISPERSION            GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
 # R2                    R^2 of residual with bias included vs. total average
 # ADJUSTED_R2           Adjusted R^2 of residual with bias included vs. total average
 # R2_NOBIAS             R^2 of residual with bias subtracted vs. total average
 # ADJUSTED_R2_NOBIAS    Adjusted R^2 of residual with bias subtracted vs. total average
 # R2_VS_0               * R^2 of residual with bias included vs. zero constant
 # ADJUSTED_R2_VS_0      * Adjusted R^2 of residual with bias included vs. zero constant
 # -------------------------------------------------------------------------------------
 # * The last two statistics are only printed if there is no intercept (icpt=0)
 #
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar SystemDS.jar -f LinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B
 #     O=OUTPUT_DIR/Out icpt=2 reg=1.0 fmt=csv

 fileX = $X;
 fileY = $Y;
 fileB = $B;
 fileO = ifdef ($O, " ");
 fmtB  = ifdef ($fmt, "text");

 intercept_status = ifdef ($icpt, 0);     # $icpt=0;
 regularization = ifdef ($reg, 0.000001); # $reg=0.000001;

 print ("BEGIN LINEAR REGRESSION SCRIPT");
 print ("Reading X and Y...");
 X = read (fileX);
 y = read (fileY);

 n = nrow (X);
 m = ncol (X);
 ones_n = matrix (1, rows = n, cols = 1);
 zero_cell = matrix (0, rows = 1, cols = 1);

 # Introduce the intercept, shift and rescale the columns of X if needed

 m_ext = m;
 if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
 {
     X = cbind (X, ones_n);
     m_ext = ncol (X);
 }

 scale_lambda = matrix (1, rows = m_ext, cols = 1);
 if (intercept_status == 1 | intercept_status == 2)
 {
     scale_lambda [m_ext, 1] = 0;
 }

 if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
 {                           # Important assumption: X [, m_ext] = ones_n
     avg_X_cols = t(colSums(X)) / n;
     var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
     is_unsafe = (var_X_cols <= 0);
     scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
     scale_X [m_ext, 1] = 1;
     shift_X = - avg_X_cols * scale_X;
     shift_X [m_ext, 1] = 0;
 } else {
     scale_X = matrix (1, rows = m_ext, cols = 1);
     shift_X = matrix (0, rows = m_ext, cols = 1);
 }

 # Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)"
 # instead of "X".  However, in order to preserve the sparsity of X,
 # we apply the transform associatively to some other part of the expression
 # in which it occurs.  To avoid materializing a large matrix, we rewrite it:
 #
 # ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
 # ssX_A  = diag (scale_X) %*% A;
 # ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A;
 #
 # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
 # tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];

 lambda = scale_lambda * regularization;

 # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)

 A = t(X) %*% X;
 b = t(X) %*% y;
 if (intercept_status == 2) {
     A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
     A =   diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
     b =   diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
 }
 A = A + diag (lambda);

 print ("Calling the Direct Solver...");

 beta_unscaled = solve (A, b);

 # END THE DIRECT SOLVE ALGORITHM

 if (intercept_status == 2) {
     beta = scale_X * beta_unscaled;
     beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
 } else {
     beta = beta_unscaled;
 }

 print ("Computing the statistics...");

 avg_tot = sum (y) / n;
 ss_tot = sum (y ^ 2);
 ss_avg_tot = ss_tot - n * avg_tot ^ 2;
 var_tot = ss_avg_tot / (n - 1);
 y_residual = y - X %*% beta;
 avg_res = sum (y_residual) / n;
 ss_res = sum (y_residual ^ 2);
 ss_avg_res = ss_res - n * avg_res ^ 2;

 R2 = 1 - ss_res / ss_avg_tot;
 dispersion = ifelse(n > m_ext, ss_res / (n - m_ext), NaN);
 adjusted_R2 = ifelse(n > m_ext, 1 - dispersion / (ss_avg_tot / (n - 1)), NaN);

 R2_nobias = 1 - ss_avg_res / ss_avg_tot;
 deg_freedom = n - m - 1;
 if (deg_freedom > 0) {
     var_res = ss_avg_res / deg_freedom;
     adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
 } else {
     var_res = NaN;
     adjusted_R2_nobias = NaN;
     print ("Warning: zero or negative number of degrees of freedom.");
 }

 R2_vs_0 = 1 - ss_res / ss_tot;
 adjusted_R2_vs_0 = ifelse(n > m, 1 - (ss_res / (n - m)) / (ss_tot / n), NaN);

 str = "AVG_TOT_Y," + avg_tot;                                    #  Average of the response value Y
 str = append (str, "STDEV_TOT_Y," + sqrt (var_tot));             #  Standard Deviation of the response value Y
 str = append (str, "AVG_RES_Y," + avg_res);                      #  Average of the residual Y - pred(Y|X), i.e. residual bias
 str = append (str, "STDEV_RES_Y," + sqrt (var_res));             #  Standard Deviation of the residual Y - pred(Y|X)
 str = append (str, "DISPERSION," + dispersion);                  #  GLM-style dispersion, i.e. residual sum of squares / # d.f.
 str = append (str, "R2," + R2);                                  #  R^2 of residual with bias included vs. total average
 str = append (str, "ADJUSTED_R2," + adjusted_R2);                #  Adjusted R^2 of residual with bias included vs. total average
 str = append (str, "R2_NOBIAS," + R2_nobias);                    #  R^2 of residual with bias subtracted vs. total average
 str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias);  #  Adjusted R^2 of residual with bias subtracted vs. total average
 if (intercept_status == 0) {
     str = append (str, "R2_VS_0," + R2_vs_0);                    #  R^2 of residual with bias included vs. zero constant
     str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0);  #  Adjusted R^2 of residual with bias included vs. zero constant
 }

 if (fileO != " ") {
     write (str, fileO);
 } else {
     print (str);
 }

 # Prepare the output matrix
 print ("Writing the output matrix...");

 if (intercept_status == 2) {
     beta_out = cbind (beta, beta_unscaled);
 } else {
     beta_out = beta;
 }
 write (beta_out, fileB, format=fmtB);
 print ("END LINEAR REGRESSION SCRIPT");
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# THIS SCRIPT SOLVES LINEAR REGRESSION USING A DIRECT SOLVER FOR (X^T X + lambda) beta = X^T y
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X String --- Location (on HDFS) to read the matrix X of feature vectors
	# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
	# B String --- Location to store estimated regression parameters (the betas)
	# O String " " Location to write the printed statistics; by default is standard output
	# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
	# 0 = no intercept, no shifting, no rescaling;
	# 1 = add intercept, but neither shift nor rescale X;
	# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
	# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero
	# for highly dependend/sparse/numerous features
	# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
	# --------------------------------------------------------------------------------------------
	# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
	# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
	# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %% B[1:ncol(X), 1], or just X %% B
	# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
	# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
	# Col.2: betas for shifted/rescaled X and intercept
	#
	# In addition, some regression statistics are provided in CSV format, one comma-separated
	# name-value pair per each line, as follows:
	#
	# NAME MEANING
	# -------------------------------------------------------------------------------------
	# AVG_TOT_Y Average of the response value Y
	# STDEV_TOT_Y Standard Deviation of the response value Y
	# AVG_RES_Y Average of the residual Y - pred(Y\|X), i.e. residual bias
	# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y\|X)
	# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
	# R2 R^2 of residual with bias included vs. total average
	# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
	# R2_NOBIAS R^2 of residual with bias subtracted vs. total average
	# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
	# R2_VS_0 * R^2 of residual with bias included vs. zero constant
	# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
	# -------------------------------------------------------------------------------------
	# * The last two statistics are only printed if there is no intercept (icpt=0)
	#
	# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
	# hadoop jar SystemDS.jar -f LinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B
	# O=OUTPUT_DIR/Out icpt=2 reg=1.0 fmt=csv

	fileX = $X;
	fileY = $Y;
	fileB = $B;
	fileO = ifdef ($O, " ");
	fmtB = ifdef ($fmt, "text");

	intercept_status = ifdef ($icpt, 0); # $icpt=0;
	regularization = ifdef ($reg, 0.000001); # $reg=0.000001;

	print ("BEGIN LINEAR REGRESSION SCRIPT");
	print ("Reading X and Y...");
	X = read (fileX);
	y = read (fileY);

	n = nrow (X);
	m = ncol (X);
	ones_n = matrix (1, rows = n, cols = 1);
	zero_cell = matrix (0, rows = 1, cols = 1);

	# Introduce the intercept, shift and rescale the columns of X if needed

	m_ext = m;
	if (intercept_status == 1 \| intercept_status == 2) # add the intercept column
	{
	X = cbind (X, ones_n);
	m_ext = ncol (X);
	}

	scale_lambda = matrix (1, rows = m_ext, cols = 1);
	if (intercept_status == 1 \| intercept_status == 2)
	{
	scale_lambda [m_ext, 1] = 0;
	}

	if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1
	{ # Important assumption: X [, m_ext] = ones_n
	avg_X_cols = t(colSums(X)) / n;
	var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
	is_unsafe = (var_X_cols <= 0);
	scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
	scale_X [m_ext, 1] = 1;
	shift_X = - avg_X_cols * scale_X;
	shift_X [m_ext, 1] = 0;
	} else {
	scale_X = matrix (1, rows = m_ext, cols = 1);
	shift_X = matrix (0, rows = m_ext, cols = 1);
	}

	# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)"
	# instead of "X". However, in order to preserve the sparsity of X,
	# we apply the transform associatively to some other part of the expression
	# in which it occurs. To avoid materializing a large matrix, we rewrite it:
	#
	# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
	# ssX_A = diag (scale_X) %*% A;
	# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A;
	#
	# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
	# tssX_A = diag (scale_X) %% A + shift_X %% A [m_ext, ];

	lambda = scale_lambda * regularization;

	# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)

	A = t(X) %*% X;
	b = t(X) %*% y;
	if (intercept_status == 2) {
	A = t(diag (scale_X) %% A + shift_X %% A [m_ext, ]);
	A = diag (scale_X) %% A + shift_X %% A [m_ext, ];
	b = diag (scale_X) %% b + shift_X %% b [m_ext, ];
	}
	A = A + diag (lambda);

	print ("Calling the Direct Solver...");

	beta_unscaled = solve (A, b);

	# END THE DIRECT SOLVE ALGORITHM

	if (intercept_status == 2) {
	beta = scale_X * beta_unscaled;
	beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
	} else {
	beta = beta_unscaled;
	}

	print ("Computing the statistics...");

	avg_tot = sum (y) / n;
	ss_tot = sum (y ^ 2);
	ss_avg_tot = ss_tot - n * avg_tot ^ 2;
	var_tot = ss_avg_tot / (n - 1);
	y_residual = y - X %*% beta;
	avg_res = sum (y_residual) / n;
	ss_res = sum (y_residual ^ 2);
	ss_avg_res = ss_res - n * avg_res ^ 2;

	R2 = 1 - ss_res / ss_avg_tot;
	dispersion = ifelse(n > m_ext, ss_res / (n - m_ext), NaN);
	adjusted_R2 = ifelse(n > m_ext, 1 - dispersion / (ss_avg_tot / (n - 1)), NaN);

	R2_nobias = 1 - ss_avg_res / ss_avg_tot;
	deg_freedom = n - m - 1;
	if (deg_freedom > 0) {
	var_res = ss_avg_res / deg_freedom;
	adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
	} else {
	var_res = NaN;
	adjusted_R2_nobias = NaN;
	print ("Warning: zero or negative number of degrees of freedom.");
	}

	R2_vs_0 = 1 - ss_res / ss_tot;
	adjusted_R2_vs_0 = ifelse(n > m, 1 - (ss_res / (n - m)) / (ss_tot / n), NaN);

	str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
	str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
	str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y\|X), i.e. residual bias
	str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y\|X)
	str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
	str = append (str, "R2," + R2); # R^2 of residual with bias included vs. total average
	str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
	str = append (str, "R2_NOBIAS," + R2_nobias); # R^2 of residual with bias subtracted vs. total average
	str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
	if (intercept_status == 0) {
	str = append (str, "R2_VS_0," + R2_vs_0); # R^2 of residual with bias included vs. zero constant
	str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
	}

	if (fileO != " ") {
	write (str, fileO);
	} else {
	print (str);
	}

	# Prepare the output matrix
	print ("Writing the output matrix...");

	if (intercept_status == 2) {
	beta_out = cbind (beta, beta_unscaled);
	} else {
	beta_out = beta;
	}
	write (beta_out, fileB, format=fmtB);
	print ("END LINEAR REGRESSION SCRIPT");