| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Linear-regression pipeline over four row-partitioned feature inputs. |
| # Script arguments: |
| # $1..$4 feature partitions, stacked row-wise into one matrix |
| # $5 label vector y |
| # $6 value forwarded as the `cont` argument of split() |
| # $7 output location for the trained model B |
| Fin = rbind(read($1), read($2), read($3), read($4)) |
| |
| y = read($5) |
| |
| # one hot encoding categorical, other passthrough |
| Fall = as.frame(Fin) |
| jspec = "{ ids:true, dummycode:[1] }" |
| [X,M] = transformencode(target=Fall, spec=jspec) |
| print("ncol(X) = "+ncol(X)) |
| |
| # outlier handling: every value outside colMean +/- 1.5 * colSD |
| # is replaced by the mean of its column |
| colSD = colSds(X) |
| colMean = (colMeans(X)) |
| upperBound = colMean + 1.5 * colSD |
| lowerBound = colMean - 1.5 * colSD |
| outFilter = (X < lowerBound) | (X > upperBound) |
| # reuse colMean: X is unchanged since it was computed, so the means are identical |
| X = X - outFilter*X + outFilter*colMean; |
| |
| # normalization |
| X = scale(X=X, center=TRUE, scale=TRUE); |
| |
| # split training and testing |
| [Xtrain , Xtest, ytrain, ytest] = split(X=X, Y=y, cont=$6, seed=7) |
| |
| # train regression model |
| B = lm(X=Xtrain, y=ytrain, icpt=1, reg=1e-3, tol=1e-9, verbose=TRUE) |
| |
| # model evaluation on test split |
| yhat = lmpredict(X=Xtest, w=B, icpt=1); |
| y_residual = ytest - yhat; |
| |
| n_test = nrow(ytest); |
| avg_res = sum(y_residual) / n_test; |
| ss_res = sum(y_residual^2); |
| ss_avg_res = ss_res - n_test * avg_res^2; |
| # total sum of squares must come from the same rows as the residuals (ytest); |
| # using the full label vector y with the test-row count yields a meaningless R2 |
| ss_tot = sum(ytest^2) - n_test * (sum(ytest)/n_test)^2; |
| R2 = 1 - ss_res / ss_tot; |
| print("\nAccuracy:" + |
| "\n--sum(ytest) = " + sum(ytest) + |
| "\n--sum(yhat) = " + sum(yhat) + |
| "\n--AVG_RES_Y: " + avg_res + |
| "\n--SS_AVG_RES_Y: " + ss_avg_res + |
| "\n--R2: " + R2 ); |
| |
| # write trained model (the encoding metadata M is not persisted here) |
| write(B, $7) |