blob: 892b3ae53f6c7d23619d98a7c2c75b93c9537bbb [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
args <- commandArgs(TRUE)
options(digits = 22)
library("Matrix")
reorder_matrix <- function(
ncolX, B, S) {
# This function assumes that B and S have same number of elements.
# if the intercept is included in the model, all inputs should be adjusted
# appropriately before calling this function.
S = t(S);
B = as.matrix(B);
num_empty_B = ncolX - nrow(B);
if (num_empty_B < 0) {
stop("Error: unable to re-order the matrix. Reason: B more than matrix X");
}
if (num_empty_B > 0) {
pad_zeros = matrix(0, nrow = num_empty_B, ncol = 1);
B = rbind(B, pad_zeros);
S = rbind(S, pad_zeros);
}
S0 = matrix(0, nrow = nrow(S), ncol = ncol(S));
S0 = S;
# since the table won't accept zeros as index we hack it.
for (a in 1:nrow(S)) {
for (b in 1:ncol(S)) {
if (S[a, b] == 0) {
S0[a, b] = ncolX + 1;
}
}
}
seqS = seq(1, nrow(S0));
P = matrix(0, nrow = ncolX, ncol = ncolX);
for (a in 1:ncolX) {
for (b in 1:ncolX) {
if (nrow(S0) < b) {
P[a, b] = 0;
} else if (seqS[b] > ncolX | S0[b] > ncolX) {
} else {
x = seqS[b];
y = S0[b];
P[x, y] = 1;
}
}
}
Y = t(P) %*% B;
return(Y)
}
thr = 0.001;
intercept_status = 1;
X = as.matrix(readMM(paste(args[1], "A.mtx", sep = "")))
y = as.matrix(readMM(paste(args[1], "B.mtx", sep = "")))
# currently only the forward selection strategy in supported: start from one feature and iteratively add
# features until AIC improves
dir = 0;
stop = 0;
intercept_status = 1;
thr = 0.001;
print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
print("Reading X and Y...");
X_orig = X;
n = nrow(X_orig);
m_orig = ncol(X_orig);
# BEGIN STEPWISE LINEAR REGRESSION
if (dir == 0) {
continue = TRUE;
columns_fixed = matrix(0, 1, m_orig);
columns_fixed_ordered = matrix(0, 1, 1);
# X_global stores the best model found at each step
X_global = matrix(0, n, 1);
if (intercept_status == 1 | intercept_status == 2) {
beta = mean(y);
AIC_best = 2 + n * log(sum((beta - y) ^ 2) / n);
} else {
beta = 0.0;
AIC_best = n * log(sum(y ^ 2) / n);
}
AICs = matrix(t(AIC_best), 1, m_orig);
boa_ncol = ncol(X_orig);
if (intercept_status != 0) {
boa_ncol = boa_ncol + 1
}
beta_out_all = matrix(0, boa_ncol, m_orig * 1);
y_ncol = 1;
column_best = 0;
# First pass to examine single features
for (i in 1:m_orig) {
columns_fixed_ordered_1 = as.matrix(i);
x = as.matrix(X_orig[, i]);
beta_out_i = as.matrix(lm.fit(x, y)$coefficients)
# COMPUTE AIC
y_residual = y - x %*% beta_out_i;
ss_res = sum(y_residual ^ 2);
eq_deg_of_freedom = ncol(x);
AIC_1 = (2 * eq_deg_of_freedom) + n * log(ss_res / n);
AICs[1, i] = AIC_1;
AIC_cur = AICs[1, i];
if ((AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs(thr * AIC_best))) {
column_best = i;
AIC_best = AIC_cur;
}
beta_out_all[1:nrow(beta_out_i), ((i - 1) * y_ncol + 1):(i * y_ncol)] = beta_out_i[,1];
}
# beta best so far
beta_best = beta_out_all[, ((column_best - 1) * y_ncol + 1):(column_best * y_ncol)];
if (column_best == 0) {
Selected = matrix(0, nrow = 1, ncol = 1);
if (intercept_status == 0) {
B = matrix(beta, nrow = m_orig, ncol = 1);
} else {
B_tmp = matrix(0, nrow = m_orig + 1, ncol = 1);
B_tmp[m_orig + 1,] = beta;
B = B_tmp;
}
beta_out = B;
writeMM(as(beta_out, "CsparseMatrix"), paste(args[2], "C", sep = ""));
writeMM(as(Selected, "CsparseMatrix"), paste(args[2], "S", sep = ""));
stop = 1;
}
columns_fixed[1, column_best] = 1;
columns_fixed_ordered[1, 1] = column_best;
X_global = X_orig[, column_best];
while (continue) {
# Subsequent passes over the features
beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1);
for (i in 1:m_orig) {
if (columns_fixed[1, i] == 0) {
# Construct the feature matrix
X = cbind(X_global, X_orig[, i]);
tmp = matrix(0, nrow = 1, ncol = 1);
tmp[1, 1] = i;
columns_fixed_ordered_2 = append(columns_fixed_ordered, tmp);
x = as.matrix(X);
n = nrow(x);
m = ncol(x);
# Introduce the intercept, shift and rescale the columns of X if needed
if (intercept_status == 1 | intercept_status == 2) {
# add the intercept column
ones_n = matrix(1, nrow = n, ncol = 1);
x = cbind(X_orig[, i], ones_n);
m = m - 1;
}
m_ext = ncol(x);
# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
beta_out_i = lm.fit(x, y)$coefficients
# COMPUTE AIC
y_residual = y - x %*% beta_out_i;
ss_res = sum(y_residual ^ 2);
eq_deg_of_freedom = m_ext;
AIC_2 = (2 * eq_deg_of_freedom) + n * log(ss_res / n);
b = as.matrix(beta_out_i)
beta_out_all_2[1:nrow(b), i:i] = b[1, 1];
if ((AIC_2 < AIC_best) & ((AIC_best - AIC_2) > abs(thr * AIC_best)) & (columns_fixed[1, i] == 0)) {
column_best = i;
AIC_best = AIC_2;
}
}
}
# have the best beta store in the matrix
beta_best = beta_out_all_2[, ((column_best - 1) * y_ncol + 1):(column_best * y_ncol)];
# Append best found features (i.e., columns) to X_global
if (is.null(columns_fixed[1, column_best])) {
# new best feature found
columns_fixed[1, column_best] = 1;
columns_fixed_ordered = cbind(columns_fixed_ordered, as.matrix(column_best));
if (ncol(columns_fixed_ordered) == m_orig) {
# all features examined
X_global = cbind(X_global, X_orig[, column_best]);
continue = FALSE;
} else {
X_global = cbind(X_global, X_orig[, column_best]);
}
} else {
continue = FALSE;
}
}
if (stop == 0) {
# run linear regression with selected set of features
print("Running linear regression with selected features...");
x = as.matrix(X_global);
n = nrow(x);
m = ncol(x);
# Introduce the intercept, shift and rescale the columns of X if needed
if (intercept_status == 1 | intercept_status == 2) {
# add the intercept column
ones_n = matrix(1, nrow = n, ncol = 1);
x = cbind(X_orig[, i], ones_n);
m = m - 1;
}
m_ext = ncol(x);
# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
beta_out = lm.fit(x, y)$coefficients
# COMPUTE AIC
y_residual = y - x %*% beta_out;
ss_res = sum(y_residual ^ 2);
eq_deg_of_freedom = m_ext;
AIC = (2 * eq_deg_of_freedom) + n * log(ss_res / n);
Selected = columns_fixed_ordered;
if (intercept_status != 0) {
Selected = cbind(Selected, matrix(boa_ncol, nrow = 1, ncol = 1))
}
beta_out = reorder_matrix(boa_ncol, beta_out, Selected);
writeMM(as(beta_out, "CsparseMatrix"), paste(args[2], "C", sep = ""));
writeMM(as(Selected[1], "CsparseMatrix"), paste(args[2], "S", sep = ""));
}
} else {
stop("Currently only forward selection strategy is supported!");
}