# src/test/scripts/functions/federated/FederatedLmPipeline.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Assemble the federated input from the two worker addresses. The range list
 # carries one begin/end index pair per address, in address order.
 # NOTE(review): $rows / 2 presumably expects an even $rows - confirm.
 fedData = federated(
     addresses=list($in_X1, $in_X2),
     ranges=list(
         list(0, 0), list($rows / 2, $cols),
         list($rows / 2, 0), list($rows, $cols)))
 y = read($in_Y)

 # one-hot encode column 1 (dummycode); other columns pass through
 encodeSpec = "{ ids:true, dummycode:[1] }"
 fedFrame = as.frame(fedData)
 [X, M] = transformencode(target=fedFrame, spec=encodeSpec)
 print("ncol(X) = " + ncol(X))

 # Outlier clipping: any entry farther than 1.5 column standard deviations
 # from its column mean is overwritten with that column mean.
 mu = colMeans(X)
 sigma = colSds(X)
 lo = mu - 1.5 * sigma
 hi = mu + 1.5 * sigma
 isOutlier = (X < lo) | (X > hi)
 # isOutlier is a 0/1 mask: keep X where it is 0, substitute mu where it is 1
 X = X - isOutlier * X + isOutlier * mu

 # normalization: center and scale through the scale() builtin
 X = scale(X=X, center=TRUE, scale=TRUE)

 # Partition features and labels into train and test parts; $cont is
 # forwarded to split() and the seed is fixed at 7.
 [Xtrain, Xtest, ytrain, ytest] = split(X=X, Y=y, cont=$cont, seed=7)

 # Fit the regression model on the training part only.
 B = lm(
     X=Xtrain, y=ytrain,
     icpt=1, reg=1e-3, tol=1e-9,
     verbose=TRUE)

 # Score the test rows with the fitted model and collect the residuals.
 yhat = lmpredict(X=Xtest, w=B, icpt=1)
 y_residual = ytest - yhat

 # Residual statistics over the test rows.
 n_test = nrow(ytest)
 avg_res = sum(y_residual) / n_test
 ss_res = sum(y_residual^2)
 ss_avg_res = ss_res - n_test * avg_res^2

 # Total sum of squares about the mean. It is taken over ytest so that it
 # covers the same rows (and the same row count) as ss_res; pairing the full
 # y with nrow(ytest) would not be the spread of any single sample.
 avg_tot = sum(ytest) / n_test
 ss_avg_tot = sum(ytest^2) - n_test * avg_tot^2
 R2 = 1 - ss_res / ss_avg_tot
 print("\nAccuracy:" +
       "\n--sum(ytest) = " + sum(ytest) +
       "\n--sum(yhat) = " + sum(yhat) +
       "\n--AVG_RES_Y: " + avg_res +
       "\n--SS_AVG_RES_Y: " + ss_avg_res +
       "\n--R2: " + R2 )

 # persist the trained model B at the location given by $out
 write(B, $out)
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# NOTE(review): this tab-indented section appears to repeat the pipeline
	# found earlier in the file - confirm whether the duplicate is intended.
	# Assemble the federated input from the two worker addresses. The range list
	# carries one begin/end index pair per address, in address order.
	# NOTE(review): $rows / 2 presumably expects an even $rows - confirm.
	fedData = federated(
	    addresses=list($in_X1, $in_X2),
	    ranges=list(
	        list(0, 0), list($rows / 2, $cols),
	        list($rows / 2, 0), list($rows, $cols)))
	y = read($in_Y)

	# one-hot encode column 1 (dummycode); other columns pass through
	encodeSpec = "{ ids:true, dummycode:[1] }"
	fedFrame = as.frame(fedData)
	[X, M] = transformencode(target=fedFrame, spec=encodeSpec)
	print("ncol(X) = " + ncol(X))

	# Outlier clipping: any entry farther than 1.5 column standard deviations
	# from its column mean is overwritten with that column mean.
	colSD = colSds(X)
	colMean = colMeans(X)
	upperBound = colMean + 1.5 * colSD
	lowerBound = colMean - 1.5 * colSD
	# element-wise OR of the two comparisons gives a 0/1 outlier mask
	outFilter = (X < lowerBound) | (X > upperBound)
	# keep X where the mask is 0, substitute the column mean where it is 1
	X = X - outFilter * X + outFilter * colMean

	# normalization: center and scale through the scale() builtin
	X = scale(X=X, center=TRUE, scale=TRUE)

	# Partition features and labels into train and test parts; $cont is
	# forwarded to split() and the seed is fixed at 7.
	[Xtrain, Xtest, ytrain, ytest] = split(X=X, Y=y, cont=$cont, seed=7)

	# Fit the regression model on the training part only.
	B = lm(
	    X=Xtrain, y=ytrain,
	    icpt=1, reg=1e-3, tol=1e-9,
	    verbose=TRUE)

	# Score the test rows with the fitted model and collect the residuals.
	yhat = lmpredict(X=Xtest, w=B, icpt=1)
	y_residual = ytest - yhat

	# Residual statistics over the test rows.
	n_test = nrow(ytest)
	avg_res = sum(y_residual) / n_test
	ss_res = sum(y_residual^2)
	ss_avg_res = ss_res - n_test * avg_res^2

	# Total sum of squares about the mean. It is taken over ytest so that it
	# covers the same rows (and the same row count) as ss_res; pairing the full
	# y with nrow(ytest) would not be the spread of any single sample.
	avg_tot = sum(ytest) / n_test
	ss_avg_tot = sum(ytest^2) - n_test * avg_tot^2
	R2 = 1 - ss_res / ss_avg_tot
	print("\nAccuracy:" +
	    "\n--sum(ytest) = " + sum(ytest) +
	    "\n--sum(yhat) = " + sum(yhat) +
	    "\n--AVG_RES_Y: " + avg_res +
	    "\n--SS_AVG_RES_Y: " + ss_avg_res +
	    "\n--R2: " + R2 )

	# persist the trained model B at the location given by $out
	write(B, $out)