| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # This script generates random data for linear regression. A matrix is generated |
| # consisting of a data matrix with a label column appended to it. |
| # |
| # INPUT PARAMETERS: |
| # -------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # -------------------------------------------------------------------------------------------- |
| # numSamples Int --- Number of samples |
| # numFeatures Int --- Number of features (independent variables) |
| # maxFeatureValue Int --- Maximum feature value (absolute value) |
| # maxWeight Int --- Maximum weight (absolute value) |
| # addNoise Boolean --- Determines whether noise should be added to Y |
| # b Double --- Intercept |
| # sparsity Double --- Controls the sparsity in the generated data (a value between 0 and 1) |
| # output String --- Location to write the generated data/label matrix |
| # format String --- Matrix output format |
| # perc Double 0.8 Fraction of the samples to use for training (a value between 0 and 1) |
| # percFile String perc.csv File to store the split fractions (perc and 1-perc), written as CSV |
| # -------------------------------------------------------------------------------------------- |
| # OUTPUT: Matrix of random data with appended label column, plus a 2x1 CSV file (percFile) holding perc and 1-perc |
| # --------------------------------------------------------------------------------------------- |
| # |
| # Example |
| # ./runStandaloneSystemML.sh algorithms/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv |
| # |
| |
| # Record the split fractions in a 2x1 matrix and persist it as CSV. |
| # Both arguments are optional: $perc falls back to 0.8 and $percFile to "perc.csv". |
| splitFile = ifdef($percFile, "perc.csv") |
| trainFraction = ifdef($perc, 0.8) |
| |
| # Row 1 holds the supplied fraction, row 2 holds its complement. |
| splits = matrix(0, rows=2, cols=1) |
| splits[1,1] = trainFraction |
| splits[2,1] = 1 - trainFraction |
| |
| write(splits, splitFile, format="csv") |
| |
| # Feature matrix X ($numSamples x $numFeatures): uniform draws between -1 and 1 at the |
| # requested sparsity, then scaled by $maxFeatureValue. |
| X = Rand(rows=$numSamples, cols=$numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$sparsity) |
| X = X * $maxFeatureValue |
| |
| # Weight vector w ($numFeatures x 1): uniform draws between -1 and 1, scaled by $maxWeight. |
| w = Rand(rows=$numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) |
| w = w * $maxWeight |
| |
| # Labels follow the linear model Y = X w + b, with $b as the intercept. |
| Y = X %*% w |
| Y = Y + $b |
| |
| # Optionally add one draw from the "normal" pdf to each label. |
| # NOTE(review): X, w and noise are all generated with seed=0 -- confirm whether the three |
| # draws are meant to share a seed or should be independent of each other. |
| if ($addNoise == TRUE) { |
|     noise = Rand(rows=$numSamples, cols=1, pdf="normal", seed=0) |
|     Y = Y + noise |
| } |
| |
| # Place the label column to the right of the features and write the combined matrix. |
| # cbind() is used instead of append(), which the DML language reference marks as deprecated. |
| Z = cbind(X, Y) |
| write(Z, $output, format=$format) |