blob: 800c2caee9e13c03550b22e977820727ae2ac3e0 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGIRITHM USING AIC
# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X String --- Location (on HDFS) to read the matrix X of feature vectors
# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
# B String --- Location to store estimated regression parameters (the betas)
# S String --- Location to write the selected features ordered as computed by the algorithm
# O String " " Location to write the printed statistics; by default is standard output
# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
# 0 = no intercept, no shifting, no rescaling;
# 1 = add intercept, but neither shift nor rescale X;
# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
# no further features are being checked and the algorithm stops
# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
# write_beta Boolean TRUE Should the beta's be returned?
# 0 = no
# 1 = yes
# --------------------------------------------------------------------------------------------
# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# Col.2: betas for shifted/rescaled X and intercept
#
# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
# name-value pair per each line, as follows:
#
# NAME MEANING
# -------------------------------------------------------------------------------------
# AVG_TOT_Y Average of the response value Y
# STDEV_TOT_Y Standard Deviation of the response value Y
# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
return(Matrix[Double] B, Matrix[Double] S)
{
if( icpt!=0 & icpt!=1 & icpt!=2 )
stop("Invalid steplm invocation with icpt="+icpt+" (valid values: 0,1,2).");
# NOTE: currently only the forward selection strategy in supported:
# start from one feature and iteratively add features until AIC improves
thr = 0.001;
if(verbose)
print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
X_orig = X;
n = nrow(X_orig);
m_orig = ncol(X_orig);
# BEGIN STEPWISE LINEAR REGRESSION
columns_fixed = matrix(0, 1, m_orig);
columns_fixed_ordered = matrix(0, 1, 1);
# X_global stores the best model found at each step
X_global = matrix(0, n, 1);
if (icpt == 1 | icpt == 2) {
beta = mean(y);
AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n);
} else {
beta = 0.0;
AIC_best_orig = n * log(sum(y ^ 2) / n);
}
if(verbose)
print("Best AIC without any features: " + AIC_best_orig);
boa_ncol = ncol(X_orig) + as.integer(icpt!=0);
beta_out_all = matrix(0, boa_ncol, m_orig);
# First pass to examine single features
AICs = matrix(0, 1, m_orig);
parfor (i in 1:m_orig) {
[AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, maxi, verbose);
AICs[1, i] = AIC_1;
beta_out_all[1:nrow(beta_out_i), i] = beta_out_i;
}
AIC_best = min(min(AICs), AIC_best_orig);
AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), 0);
# beta best so far
beta_best = beta_out_all[, column_best];
if (column_best == 0) {
if(verbose)
print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
B = matrix(0, m_orig, 1);
if (icpt != 0)
B = rbind(B, as.matrix(beta));
S = matrix(0, 1, 1);
}
else {
if(verbose)
print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1, column_best] = 1;
columns_fixed_ordered[1, 1] = column_best;
X_global = X_orig[, column_best];
continue = TRUE
while (continue) {
# Subsequent passes over the features
beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1);
AICs_2 = matrix(0, 1, m_orig); # full overwrite
parfor (i in 1:m_orig) {
if (as.scalar(columns_fixed[1, i]) == 0) {
# Construct the feature matrix
Xi = cbind(X_global, X_orig[, i]);
[AIC_2, beta_out_i2] = linear_regression(Xi, y, icpt, reg, tol, maxi, verbose);
AICs_2[1, i] = AIC_2;
beta_out_all_2[1:nrow(beta_out_i2), i] = beta_out_i2;
}
else {
AICs_2[1,i] = Inf;
}
}
# Determine the best AIC
AIC_best_orig = AIC_best;
AIC_best = min(min(AICs_2), AIC_best_orig);
AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs_2)), column_best);
# have the best beta store in the matrix
beta_best = beta_out_all_2[, column_best];
# Append best found features (i.e., columns) to X_global
if (as.scalar(columns_fixed[1, column_best]) == 0) {
# new best feature found
if(verbose)
print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
columns_fixed[1, column_best] = 1;
columns_fixed_ordered = cbind(columns_fixed_ordered, as.matrix(column_best));
if (ncol(columns_fixed_ordered) == m_orig) {
# all features examined
X_global = cbind(X_global, X_orig[, column_best]);
continue = FALSE;
} else {
X_global = cbind(X_global, X_orig[, column_best]);
}
} else {
continue = FALSE;
}
}
# run linear regression with selected set of features
if( verbose )
print("Running linear regression with selected features...");
[AIC, beta_out] = linear_regression(X_global, y, icpt, reg, tol, maxi, verbose);
S = columns_fixed_ordered;
if (icpt != 0)
S = cbind(S, matrix(boa_ncol, 1, 1))
B = reorder_matrix(boa_ncol, beta_out, S);
}
}
# Computes linear regression using lm and outputs AIC.
linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt,
Double reg, Double tol, Integer maxi, Boolean verbose)
return(Double AIC, Matrix[Double] beta)
{
# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
beta = lm(X = X, y = y, icpt = icpt, reg=reg, tol=tol, maxi=maxi, verbose=FALSE);
# PREPARE X for SCORING
if( icpt != 0 )
X = cbind(X, matrix(1,nrow(X),1))
# COMPUTE AIC
n = nrow(X);
y_residual = y - X %*% beta;
ss_res = sum(y_residual ^ 2);
AIC = (2 * ncol(X)) + n * log(ss_res / n);
}
reorder_matrix = function(
double ncolX, # number of column in X, inlcuding the intercept column
matrix[double] B, # beta
matrix[double] S # Selected
) return(matrix[double] Y) {
# This function assumes that B and S have same number of elements.
# if the intercept is included in the model, all inputs should be adjusted
# appropriately before calling this function.
S = t(S);
num_empty_B = ncolX - nrow(B);
if (num_empty_B < 0) {
stop("Error: unable to re-order the matrix. Reason: B more than matrix X");
}
if (num_empty_B > 0) {
pad_zeros = matrix(0, num_empty_B, 1);
B = rbind(B, pad_zeros);
S = rbind(S, pad_zeros);
}
# since the table won't accept zeros as index we hack it.
S0 = replace(target = S, pattern = 0, replacement = ncolX + 1);
seqS = seq(1, nrow(S0));
P = table(seqS, S0, ncolX, ncolX);
Y = t(P) %*% B;
}
checkAIC = function(Double AIC_cur, Double AIC_best, Double thr) return (Boolean R) {
R = (AIC_cur < AIC_best) & (AIC_best-AIC_cur > abs(thr * AIC_best))
}