blob: 96b2355bf46ad8c8e4c3e4e293c89648b3ced825 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
checkR2 = function(Matrix[double] X, Matrix[double] y, Matrix[double] y_p,
Matrix[double] beta, Integer icpt) return (Double R2_ad)
{
n = nrow(X);
m = ncol(X);
m_ext = m;
if (icpt == 1|icpt == 2)
m_ext = m+1; #due to extra column ones
avg_tot = sum(y)/n;
ss_tot = sum(y^2);
ss_avg_tot = ss_tot - n*avg_tot^2;
y_res = y - y_p;
avg_res = sum(y - y_p)/n;
ss_res = sum((y - y_p)^2);
R2 = 1 - ss_res/ss_avg_tot;
dispersion = ifelse(n>m_ext, ss_res/(n-m_ext), NaN);
R2_ad = ifelse(n>m_ext, 1-dispersion/(ss_avg_tot/(n-1)), NaN);
}
PCA = function(Matrix[Double] A, Integer K = ncol(A), Integer center = 1, Integer scale = 1,
Integer projectData = 1) return(Matrix[Double] newA)
{
evec_dominant = matrix(0,cols=1,rows=1);
N = nrow(A);
D = ncol(A);
print("K = "+K);
# perform z-scoring (centering and scaling)
A = scale(A, center==1, scale==1);
# co-variance matrix
mu = colSums(A)/N;
C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;
# compute eigen vectors and values
[evalues, evectors] = eigen(C);
decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
diagmat = table(seq(1,D),decreasing_Idx);
# sorts eigenvalues by decreasing order
evalues = diagmat %*% evalues;
# sorts eigenvectors column-wise in the order of decreasing eigenvalues
evectors = evectors %*% diagmat;
# select K dominant eigen vectors
nvec = ncol(evectors);
eval_dominant = evalues[1:K, 1];
evec_dominant = evectors[,1:K];
# the square root of eigenvalues
eval_stdev_dominant = sqrt(eval_dominant);
if (projectData == 1){
# Construct new data set by treating computed dominant eigenvectors as the basis vectors
newA = A %*% evec_dominant;
}
}
# Get the dataset
M = 1000;
A = rand(rows=M, cols=100, seed=1);
y = rand(rows=M, cols=1, seed=2);
R = matrix(0, rows=1, cols=20);
Kc = floor(ncol(A) * 0.8);
for (i in 1:10) {
newA1 = PCA(A=A, K=Kc+i);
beta1 = lm(X=newA1, y=y, icpt=1, reg=0.0001, verbose=FALSE);
y_predict1 = lmpredict(X=newA1, w=beta1, icpt=1);
R2_ad1 = checkR2(newA1, y, y_predict1, beta1, 1);
R[,i] = R2_ad1;
}
for (i in 1:10) {
newA3 = PCA(A=A, K=Kc+5);
beta3 = lm(X=newA3, y=y, icpt=1, reg=0.001*i, verbose=FALSE);
y_predict3 = lmpredict(X=newA3, w=beta3, icpt=1);
R2_ad3 = checkR2(newA3, y, y_predict3, beta3, 1);
R[,10+i] = R2_ad3;
}
write(R, $1);