#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT SELECTS A LINEAR MODEL VIA A STEPWISE ALGORITHM USING AIC
# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
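# THE AIC USED THROUGHOUT IS  AIC = 2 * k + n * log (RSS / n),  WHERE k IS THE NUMBER
# OF ESTIMATED PARAMETERS AND RSS THE RESIDUAL SUM OF SQUARES OF THE FITTED MODEL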
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X String --- Location (on HDFS) to read the matrix X of feature vectors
# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
# B String --- Location to store estimated regression parameters (the betas)
# S String --- Location to write the selected features ordered as computed by the algorithm
# O String " " Location to write the printed statistics; by default they go to standard output
# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# thr Double 0.01 Threshold to stop the algorithm: if the decrease in AIC falls below thr * |best AIC so far|,
# no further features are added and the algorithm stops
# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
# --------------------------------------------------------------------------------------------
# OUTPUT: Matrix of regression parameters (the betas), whose size depends on the icpt input value:
# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# Col.2: betas for shifted/rescaled X and intercept
#
# In addition, the last run of linear regression prints some statistics in CSV format, one comma-separated
# name-value pair per line, as follows:
#
# NAME MEANING
# -------------------------------------------------------------------------------------
# AVG_TOT_Y Average of the response value Y
# STDEV_TOT_Y Standard Deviation of the response value Y
# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
# PLAIN_R2 Plain R^2 of residual with bias included vs. total average
# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average
# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant
# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
# -------------------------------------------------------------------------------------
# * The last two statistics are only printed if there is no intercept (icpt=0)
# If the best AIC is achieved without any features, the matrix of selected features contains a single 0;
# in this case, no further statistics are produced
#
# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
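#
# Scoring new data with the learned betas (here for icpt=1) can be done along these
# lines; a minimal sketch, where X_new and the file paths are hypothetical:
#   B = read ("OUTPUT_DIR/betas");
#   X_new = read ("INPUT_DIR/X_new");
#   y_pred = X_new %*% B[1:ncol(X_new), 1] + as.scalar (B[ncol(X_new) + 1, 1]);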
fileX = $X;
fileY = $Y;
fileB = $B;
fileS = $S;
# currently only the forward selection strategy is supported: start from one feature and iteratively add
# features as long as the AIC improves
dir = "forward";
fmt = ifdef ($fmt, "text");
intercept_status = ifdef ($icpt, 0);
thr = ifdef ($thr, 0.01);
print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
print ("Reading X and Y...");
X_orig = read (fileX);
y = read (fileY);
n = nrow (X_orig);
m_orig = ncol (X_orig);
# BEGIN STEPWISE LINEAR REGRESSION
if (dir == "forward") {
  continue = TRUE;
  columns_fixed = matrix (0, rows = 1, cols = m_orig);
  columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
  # X_global stores the best model found at each step
  X_global = matrix (0, rows = n, cols = 1);
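  # AIC of the null model: AIC = 2 * df + n * log (RSS / n); with an intercept the null
  # model predicts mean(y) (df = 1), otherwise it predicts the zero constant (df = 0)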
  if (intercept_status == 1 | intercept_status == 2) {
    beta = mean (y);
    AIC_best = 2 + n * log (sum ((beta - y)^2) / n);
  } else {
    beta = 0;
    AIC_best = n * log (sum (y^2) / n);
  }
  AICs = matrix (AIC_best, rows = 1, cols = m_orig);
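  # AICs[1,i] holds the AIC of the most recent candidate model that adds feature i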
print ("Best AIC without any features: " + AIC_best);
# First pass to examine single features
parfor (i in 1:m_orig) {
[AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " ");
AICs[1,i] = AIC_1;
}
# Determine the best AIC
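  # A feature qualifies only if it lowers the AIC by more than thr * |AIC_best|,
  # i.e., the stopping threshold is relative to the current best AIC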
  column_best = 0;
  for (k in 1:m_orig) {
    AIC_cur = as.scalar (AICs[1,k]);
    if ((AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best))) {
      column_best = k;
      AIC_best = as.scalar (AICs[1,k]);
    }
  }
  if (column_best == 0) {
    print ("AIC of an empty model is " + AIC_best + " and no feature achieves more than a " + (thr * 100) + "% decrease in AIC!");
    S = matrix (0, rows = 1, cols = 1);
    if (intercept_status == 0) {
      B = matrix (beta, rows = m_orig, cols = 1);
    } else {
      B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
      B_tmp[m_orig + 1,] = beta;
      B = B_tmp;
    }
    write (S, fileS, format = fmt);
    write (B, fileB, format = fmt);
    stop ("");
  }
print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1,column_best] = 1;
columns_fixed_ordered[1,1] = column_best;
X_global = X_orig[,column_best];
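  # Greedy forward selection: each iteration tries appending every remaining feature to
  # X_global, keeps the one with the best AIC, and stops when no feature qualifies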
  while (continue) {
    # Subsequent passes over the features
    parfor (i in 1:m_orig) {
      if (as.scalar (columns_fixed[1,i]) == 0) {
        # Construct the feature matrix
        X = append (X_global, X_orig[,i]);
        [AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " ");
        AICs[1,i] = AIC_2;
      }
    }
    # Determine the best AIC
    for (k in 1:m_orig) {
      AIC_cur = as.scalar (AICs[1,k]);
      if ((AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar (columns_fixed[1,k]) == 0)) {
        column_best = k;
        AIC_best = as.scalar (AICs[1,k]);
      }
    }
    # Append the best found feature (i.e., column) to X_global
    if (as.scalar (columns_fixed[1,column_best]) == 0) { # new best feature found
      print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
      columns_fixed[1,column_best] = 1;
      columns_fixed_ordered = append (columns_fixed_ordered, as.matrix (column_best));
      X_global = append (X_global, X_orig[,column_best]);
      if (ncol (columns_fixed_ordered) == m_orig) { # all features examined
        continue = FALSE;
      }
    } else {
      continue = FALSE;
    }
  }
  # Run linear regression with the selected set of features
  print ("Running linear regression with selected features...");
  [AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB);
} else {
  stop ("Currently only forward selection strategy is supported!");
}
/*
* Computes linear regression using a direct solver for (X^T X) beta = X^T y.
* It also outputs the AIC of the computed model.
*/
linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
  intercept_status = ifdef ($icpt, 0);
  fmt = ifdef ($fmt, "text");
  n = nrow (X);
  m = ncol (X);
  # Introduce the intercept, shift and rescale the columns of X if needed
  if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
    ones_n = matrix (1, rows = n, cols = 1);
    X = append (X, ones_n);
    m = m - 1;
  }
  m_ext = ncol (X);
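  # For icpt=2, compute per-column factors that standardize each column of X to mean 0
  # and variance 1; zero-variance columns are guarded (is_unsafe) by substituting
  # variance 1, and the intercept column is neither scaled nor shifted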
  if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
    # Important assumption: X[, m_ext] = ones_n
    avg_X_cols = t(colSums (X)) / n;
    var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
    is_unsafe = (var_X_cols <= 0);
    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
    scale_X[m_ext, 1] = 1;
    shift_X = - avg_X_cols * scale_X;
    shift_X[m_ext, 1] = 0;
  } else {
    scale_X = matrix (1, rows = m_ext, cols = 1);
    shift_X = matrix (0, rows = m_ext, cols = 1);
  }
  # BEGIN THE DIRECT SOLVE ALGORITHM
  A = t(X) %*% X;
  b = t(X) %*% y;
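  # For icpt=2, rewrite the normal equations in the standardized space: with
  # T = diag (scale_X) + shift_X %*% t(e_last), where e_last is the last standard basis
  # vector, the two updates of A compute T %*% A %*% t(T) (exploiting the symmetry of A)
  # and the update of b computes T %*% b, i.e., t(X_std) %*% X_std and t(X_std) %*% y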
  if (intercept_status == 2) {
    A = t(diag (scale_X) %*% A + shift_X %*% A[m_ext, ]);
    A = diag (scale_X) %*% A + shift_X %*% A[m_ext, ];
    b = diag (scale_X) %*% b + shift_X %*% b[m_ext, ];
  }
  beta_unscaled = solve (A, b);
  # END THE DIRECT SOLVE ALGORITHM
  if (intercept_status == 2) {
    beta = scale_X * beta_unscaled;
    beta[m_ext, ] = beta[m_ext, ] + t(shift_X) %*% beta_unscaled;
  } else {
    beta = beta_unscaled;
  }
  # COMPUTE AIC
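  # AIC = 2 * k + n * log (RSS / n) with k = m_ext model parameters; this is the
  # Gaussian log-likelihood form of the AIC up to an additive constant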
  y_residual = y - X %*% beta;
  ss_res = sum (y_residual ^ 2);
  eq_deg_of_freedom = m_ext;
  AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
  if (fileB != " ") {
    fileO = ifdef ($O, " ");
    fileS = $S;
    print ("Computing the statistics...");
    avg_tot = sum (y) / n;
    ss_tot = sum (y ^ 2);
    ss_avg_tot = ss_tot - n * avg_tot ^ 2;
    var_tot = ss_avg_tot / (n - 1);
    # y_residual and ss_res were already computed above for the AIC
    avg_res = sum (y_residual) / n;
    ss_avg_res = ss_res - n * avg_res ^ 2;
    plain_R2 = 1 - ss_res / ss_avg_tot;
    if (n > m_ext) {
      dispersion = ss_res / (n - m_ext);
      adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
    } else {
      dispersion = 0.0 / 0.0;
      adjusted_R2 = 0.0 / 0.0;
    }
    plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
    deg_freedom = n - m - 1;
    if (deg_freedom > 0) {
      var_res = ss_avg_res / deg_freedom;
      adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
    } else {
      var_res = 0.0 / 0.0;
      adjusted_R2_nobias = 0.0 / 0.0;
      print ("Warning: zero or negative number of degrees of freedom.");
    }
    plain_R2_vs_0 = 1 - ss_res / ss_tot;
    if (n > m) {
      adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
    } else {
      adjusted_R2_vs_0 = 0.0 / 0.0;
    }
str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average
str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average
str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
if (intercept_status == 0) {
str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant
str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
}
if (fileO != " ") {
write (str, fileO);
} else {
print (str);
}
    # Prepare the output matrix
    print ("Writing the output matrix...");
    if (intercept_status == 2) {
      beta_out = append (beta, beta_unscaled);
    } else {
      beta_out = beta;
    }
    # Output which features give the best AIC and are being used for linear regression
    write (Selected, fileS, format = fmt);
    no_selected = ncol (Selected);
    max_selected = max (Selected);
    last = max_selected + 1;
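    # Map the betas, ordered by selection step, back to the original column positions:
    # table (seq (1, k), t(Selected)) builds a 0/1 matrix P1 with P1[i, Selected[i]] = 1;
    # scaling row i by beta[i] (the elementwise multiply broadcasts the column vector)
    # and taking colSums places each beta at its original column index; trailing zeros
    # pad the unselected features, and for icpt != 0 the intercept beta is moved to
    # position m_orig + 1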
    if (intercept_status != 0) {
      Selected_ext = append (Selected, as.matrix (last));
      P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext));
      if (intercept_status == 2) {
        P1_beta = P1 * beta;
        P2_beta = colSums (P1_beta);
        P1_beta_unscaled = P1 * beta_unscaled;
        P2_beta_unscaled = colSums (P1_beta_unscaled);
        if (max_selected < m_orig) {
          P2_beta = append (P2_beta, matrix (0, rows = 1, cols = (m_orig - max_selected)));
          P2_beta_unscaled = append (P2_beta_unscaled, matrix (0, rows = 1, cols = (m_orig - max_selected)));
          P2_beta[1, m_orig + 1] = P2_beta[1, max_selected + 1];
          P2_beta[1, max_selected + 1] = 0;
          P2_beta_unscaled[1, m_orig + 1] = P2_beta_unscaled[1, max_selected + 1];
          P2_beta_unscaled[1, max_selected + 1] = 0;
        }
        beta_out = append (t(P2_beta), t(P2_beta_unscaled));
      } else {
        P1_beta = P1 * beta;
        P2_beta = colSums (P1_beta);
        if (max_selected < m_orig) {
          P2_beta = append (P2_beta, matrix (0, rows = 1, cols = (m_orig - max_selected)));
          P2_beta[1, m_orig + 1] = P2_beta[1, max_selected + 1];
          P2_beta[1, max_selected + 1] = 0;
        }
        beta_out = t(P2_beta);
      }
    } else {
      P1 = table (seq (1, no_selected), t(Selected));
      P1_beta = P1 * beta;
      P2_beta = colSums (P1_beta);
      if (max_selected < m_orig) {
        P2_beta = append (P2_beta, matrix (0, rows = 1, cols = (m_orig - max_selected)));
      }
      beta_out = t(P2_beta);
    }
    write (beta_out, fileB, format = fmt);
  }
}