blob: 00ac55d1e8ad9e14923b8abb145bbabb5ce8fbcd [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# This script generates random data for linear regression. A matrix is generated
# consisting of a data matrix with a label column appended to it.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# numSamples Int --- Number of samples
# numFeatures Int --- Number of features (independent variables)
# maxFeatureValue Int --- Maximum feature value (absolute value)
# maxWeight Int --- Maximum weight (absolute value)
# addNoise Boolean --- Determines whether noise should be added to Y
# b Double --- Intercept
# sparsity Double --- Controls the sparsity in the generated data (a value between 0 and 1)
# output String --- Location to write the generated data/label matrix
# format String --- Matrix output format
# perc Double 0.8 Fraction of samples used for training (a value between 0 and 1); perc and 1-perc are written to percFile
# percFile String "perc.csv" File (CSV) to store the training fraction (perc) and the remaining fraction (1-perc)
# --------------------------------------------------------------------------------------------
# OUTPUT: Matrix of random data with appended label column
# ---------------------------------------------------------------------------------------------
#
# Example
# ./runStandaloneSystemML.sh algorithms/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv
#
# ---- Split fractions ---------------------------------------------------------
# Optional arguments: fall back to 0.8 / "perc.csv" when not given via -nvargs.
trainFraction = ifdef($perc, 0.8)
fractionFile = ifdef($percFile, "perc.csv")

# 2x1 column vector: row 1 holds perc, row 2 holds 1 - perc.
fractions = matrix(0, rows=2, cols=1)
fractions[1,1] = trainFraction
fractions[2,1] = (1 - trainFraction)
write(fractions, fractionFile, format="csv")

# ---- Feature matrix ----------------------------------------------------------
# Uniform draws in [-1, 1] with the requested sparsity, scaled by maxFeatureValue.
# NOTE(review): every random draw in this script uses seed=0; presumably this is
# for reproducibility, but it may correlate the features with the weights --
# confirm against the Rand implementation before relying on independence.
features = rand(rows=$numSamples, cols=$numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$sparsity) * $maxFeatureValue

# ---- Weight vector -----------------------------------------------------------
# One uniform weight in [-1, 1] per feature, scaled by maxWeight.
weights = rand(rows=$numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) * $maxWeight

# ---- Labels ------------------------------------------------------------------
# Linear response plus the intercept b.
labels = (features %*% weights) + $b

# Optionally perturb the labels with one pdf="normal" draw per sample.
if ($addNoise == TRUE) {
    labels = labels + rand(rows=$numSamples, cols=1, pdf="normal", seed=0)
}

# ---- Output ------------------------------------------------------------------
# The label column is appended as the last column of the feature matrix.
dataset = append(features, labels)
write(dataset, $output, format=$format)