src/test/scripts/functions/recompile/IPAFunctionArgsFor.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 checkR2 = function(Matrix[double] X, Matrix[double] y, Matrix[double] y_p,
           Matrix[double] beta, Integer icpt) return (Double R2_ad)
 {
   n = nrow(X);
   m = ncol(X);
   m_ext = m;
   if (icpt == 1|icpt == 2)
       m_ext = m+1; #due to extra column ones
   avg_tot = sum(y)/n;
   ss_tot = sum(y^2);
   ss_avg_tot = ss_tot - n*avg_tot^2;
   y_res = y - y_p;
   avg_res = sum(y - y_p)/n;
   ss_res = sum((y - y_p)^2);
   R2 = 1 - ss_res/ss_avg_tot;
   dispersion = ifelse(n>m_ext, ss_res/(n-m_ext), NaN);
   R2_ad = ifelse(n>m_ext, 1-dispersion/(ss_avg_tot/(n-1)), NaN);
 }


 PCA = function(Matrix[Double] A, Integer K = ncol(A), Integer center = 1, Integer scale = 1,
     Integer projectData = 1) return(Matrix[Double] newA)
 {
   evec_dominant = matrix(0,cols=1,rows=1);

   N = nrow(A);
   D = ncol(A);
   print("K = "+K);

   # perform z-scoring (centering and scaling)
   A = scale(A, center==1, scale==1);

   # co-variance matrix
   mu = colSums(A)/N;
   C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu;

   # compute eigen vectors and values
   [evalues, evectors] = eigen(C);

   decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
   diagmat = table(seq(1,D),decreasing_Idx);
   # sorts eigenvalues by decreasing order
   evalues = diagmat %*% evalues;
   # sorts eigenvectors column-wise in the order of decreasing eigenvalues
   evectors = evectors %*% diagmat;


   # select K dominant eigen vectors
   nvec = ncol(evectors);

   eval_dominant = evalues[1:K, 1];
   evec_dominant = evectors[,1:K];

   # the square root of eigenvalues
   eval_stdev_dominant = sqrt(eval_dominant);

   if (projectData == 1){
     # Construct new data set by treating computed dominant eigenvectors as the basis vectors
     newA = A %*% evec_dominant;
   }
 }

 # Get the dataset
 M = 1000;
 A = rand(rows=M, cols=100, seed=1);
 y = rand(rows=M, cols=1, seed=2);
 R = matrix(0, rows=1, cols=20);

 Kc = floor(ncol(A) * 0.8);

 for (i in 1:10) {
   newA1 = PCA(A=A, K=Kc+i);
   beta1 = lm(X=newA1, y=y, icpt=1, reg=0.0001, verbose=FALSE);
   y_predict1 = lmpredict(X=newA1, w=beta1, icpt=1);
   R2_ad1 = checkR2(newA1, y, y_predict1, beta1, 1);
   R[,i] = R2_ad1;
 }

 for (i in 1:10) {
   newA3 = PCA(A=A, K=Kc+5);
   beta3 = lm(X=newA3, y=y, icpt=1, reg=0.001*i, verbose=FALSE);
   y_predict3 = lmpredict(X=newA3, w=beta3, icpt=1);
   R2_ad3 = checkR2(newA3, y, y_predict3, beta3, 1);
   R[,10+i] = R2_ad3;
 }


 write(R, $1);
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	checkR2 = function(Matrix[double] X, Matrix[double] y, Matrix[double] y_p,
	Matrix[double] beta, Integer icpt) return (Double R2_ad)
	{
	n = nrow(X);
	m = ncol(X);
	m_ext = m;
	if (icpt == 1\|icpt == 2)
	m_ext = m+1; #due to extra column ones
	avg_tot = sum(y)/n;
	ss_tot = sum(y^2);
	ss_avg_tot = ss_tot - n*avg_tot^2;
	y_res = y - y_p;
	avg_res = sum(y - y_p)/n;
	ss_res = sum((y - y_p)^2);
	R2 = 1 - ss_res/ss_avg_tot;
	dispersion = ifelse(n>m_ext, ss_res/(n-m_ext), NaN);
	R2_ad = ifelse(n>m_ext, 1-dispersion/(ss_avg_tot/(n-1)), NaN);
	}


	PCA = function(Matrix[Double] A, Integer K = ncol(A), Integer center = 1, Integer scale = 1,
	Integer projectData = 1) return(Matrix[Double] newA)
	{
	evec_dominant = matrix(0,cols=1,rows=1);

	N = nrow(A);
	D = ncol(A);
	print("K = "+K);

	# perform z-scoring (centering and scaling)
	A = scale(A, center==1, scale==1);

	# co-variance matrix
	mu = colSums(A)/N;
	C = (t(A) %% A)/(N-1) - (N/(N-1))t(mu) %*% mu;

	# compute eigen vectors and values
	[evalues, evectors] = eigen(C);

	decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE);
	diagmat = table(seq(1,D),decreasing_Idx);
	# sorts eigenvalues by decreasing order
	evalues = diagmat %*% evalues;
	# sorts eigenvectors column-wise in the order of decreasing eigenvalues
	evectors = evectors %*% diagmat;


	# select K dominant eigen vectors
	nvec = ncol(evectors);

	eval_dominant = evalues[1:K, 1];
	evec_dominant = evectors[,1:K];

	# the square root of eigenvalues
	eval_stdev_dominant = sqrt(eval_dominant);

	if (projectData == 1){
	# Construct new data set by treating computed dominant eigenvectors as the basis vectors
	newA = A %*% evec_dominant;
	}
	}

	# Get the dataset
	M = 1000;
	A = rand(rows=M, cols=100, seed=1);
	y = rand(rows=M, cols=1, seed=2);
	R = matrix(0, rows=1, cols=20);

	Kc = floor(ncol(A) * 0.8);

	for (i in 1:10) {
	newA1 = PCA(A=A, K=Kc+i);
	beta1 = lm(X=newA1, y=y, icpt=1, reg=0.0001, verbose=FALSE);
	y_predict1 = lmpredict(X=newA1, w=beta1, icpt=1);
	R2_ad1 = checkR2(newA1, y, y_predict1, beta1, 1);
	R[,i] = R2_ad1;
	}

	for (i in 1:10) {
	newA3 = PCA(A=A, K=Kc+5);
	beta3 = lm(X=newA3, y=y, icpt=1, reg=0.001*i, verbose=FALSE);
	y_predict3 = lmpredict(X=newA3, w=beta3, icpt=1);
	R2_ad3 = checkR2(newA3, y, y_predict3, beta3, 1);
	R[,10+i] = R2_ad3;
	}


	write(R, $1);