| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| checkR2 = function(Matrix[double] X, Matrix[double] y, Matrix[double] y_p, |
| Matrix[double] beta, Integer icpt) return (Double R2_ad) |
| { |
| n = nrow(X); |
| m = ncol(X); |
| m_ext = m; |
| if (icpt == 1|icpt == 2) |
| m_ext = m+1; #due to extra column ones |
| avg_tot = sum(y)/n; |
| ss_tot = sum(y^2); |
| ss_avg_tot = ss_tot - n*avg_tot^2; |
| y_res = y - y_p; |
| avg_res = sum(y - y_p)/n; |
| ss_res = sum((y - y_p)^2); |
| R2 = 1 - ss_res/ss_avg_tot; |
| dispersion = ifelse(n>m_ext, ss_res/(n-m_ext), NaN); |
| R2_ad = ifelse(n>m_ext, 1-dispersion/(ss_avg_tot/(n-1)), NaN); |
| } |
| |
| |
| PCA = function(Matrix[Double] A, Integer K = ncol(A), Integer center = 1, Integer scale = 1, |
| Integer projectData = 1) return(Matrix[Double] newA) |
| { |
| evec_dominant = matrix(0,cols=1,rows=1); |
| |
| N = nrow(A); |
| D = ncol(A); |
| print("K = "+K); |
| |
| # perform z-scoring (centering and scaling) |
| A = scale(A, center==1, scale==1); |
| |
| # co-variance matrix |
| mu = colSums(A)/N; |
| C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu; |
| |
| # compute eigen vectors and values |
| [evalues, evectors] = eigen(C); |
| |
| decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE); |
| diagmat = table(seq(1,D),decreasing_Idx); |
| # sorts eigenvalues by decreasing order |
| evalues = diagmat %*% evalues; |
| # sorts eigenvectors column-wise in the order of decreasing eigenvalues |
| evectors = evectors %*% diagmat; |
| |
| |
| # select K dominant eigen vectors |
| nvec = ncol(evectors); |
| |
| eval_dominant = evalues[1:K, 1]; |
| evec_dominant = evectors[,1:K]; |
| |
| # the square root of eigenvalues |
| eval_stdev_dominant = sqrt(eval_dominant); |
| |
| if (projectData == 1){ |
| # Construct new data set by treating computed dominant eigenvectors as the basis vectors |
| newA = A %*% evec_dominant; |
| } |
| } |
| |
| # Get the dataset |
| M = 1000; |
| A = rand(rows=M, cols=100, seed=1); |
| y = rand(rows=M, cols=1, seed=2); |
| R = matrix(0, rows=1, cols=20); |
| |
| Kc = floor(ncol(A) * 0.8); |
| |
| for (i in 1:10) { |
| newA1 = PCA(A=A, K=Kc+i); |
| beta1 = lm(X=newA1, y=y, icpt=1, reg=0.0001, verbose=FALSE); |
| y_predict1 = lmpredict(X=newA1, w=beta1, icpt=1); |
| R2_ad1 = checkR2(newA1, y, y_predict1, beta1, 1); |
| R[,i] = R2_ad1; |
| } |
| |
| parfor (i in 1:10) { |
| newA3 = PCA(A=A, K=Kc+5); |
| beta3 = lm(X=newA3, y=y, icpt=1, reg=0.001*i, verbose=FALSE); |
| y_predict3 = lmpredict(X=newA3, w=beta3, icpt=1); |
| R2_ad3 = checkR2(newA3, y, y_predict3, beta3, 1); |
| R[,10+i] = R2_ad3; |
| } |
| |
| |
| write(R, $1); |