| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # generates random data to test linear logistic regression |
| |
| # 100K dataset |
| # hadoop jar SystemDS.jar -f genRandData4LogisticRegression.dml -args 100000 500 0.0 5.0 itau/logreg/w_100k itau/logreg/X_100k_500 itau/logreg/y_100k 0 0 0.01 |
| |
| # 1M dataset |
| # hadoop jar SystemDS.jar -f genRandData4LogisticRegression.dml -args 1000000 1000 0.0 5.0 itau/logreg/w_1m itau/logreg/X_1m_1k itau/logreg/y_1m 0 0 0.0001 |
| |
| # $1 is number of samples |
| # $2 is number of features (independent variables) |
| # $3 is the mean of the linear form (w^T X) |
| # $4 is the st.dev. of the linear form (w^T X) |
| # $5 is location to store generated weights |
| # $6 is location to store generated data |
| # $7 is location to store generated labels |
| # $8 is the addNoise flag (0 or 1). NOTE: the script currently draws the same random thresholds for both settings, so the generated dataset is identical for 0 and 1 |
| # $9 is 0 if no intercept and 1 if there is intercept |
| # $10 is the sparsity of the generated data matrix X (passed as the "sparsity" argument of Rand) |
| |
| # ---- Command-line arguments ($5, $6, $7 are output paths; $10 is the sparsity of X) ---- |
| nRows = $1 |
| nCols = $2 |
| targetMean = $3 |
| targetSigma = $4 |
| noiseFlag = $8 |
| interceptFlag = $9 |
| |
| # Feature matrix: uniform values in [-1, 2] with the requested sparsity. |
| X = Rand (rows=nRows, cols=nCols, min=-1, max=2, pdf="uniform", seed=0, sparsity=$10); |
| |
| # Initial (unscaled) weights: uniform values in [-1, 1]. With an intercept, X temporarily |
| # gets an extra all-ones column and the weight vector gets one extra entry for it. |
| if (interceptFlag != 0) { |
|     onesCol = Rand (rows=nRows, cols=1, min=1, max=1); |
|     X = cbind (X, onesCol); |
|     w = Rand (rows=nCols + 1, cols=1, min=-1, max=1, pdf="uniform", seed=0); |
| } else { |
|     w = Rand (rows=nCols, cols=1, min=-1, max=1, pdf="uniform", seed=0); |
| } |
| |
| # Shift/scale the weights so the linear form X %*% w has the requested mean and st.dev. |
| [w, relaxedSigma] = scaleWeights (X, w, targetMean, targetSigma); |
| if (relaxedSigma != targetSigma) { |
|     print ("The standard deviation requirement on the linear form is TOO TIGHT!"); |
|     print ("We relaxed sigmaLF from " + targetSigma + " to " + relaxedSigma + "."); |
| } |
| linForm = X %*% w; |
| |
| # Drop the temporary all-ones column again; only the original features are written out. |
| if (interceptFlag != 0) { |
|     X = X [, 1:nCols]; |
| } |
| |
| # Report the statistics of the linear form that were actually achieved. |
| avgLF = sum (linForm) / nRows; |
| sdLF = sqrt (sum (linForm * linForm) / nRows - avgLF * avgLF); |
| print ("Empirical meanLF = " + avgLF + "; Empirical sigmaLF = " + sdLF); |
| |
| # Logistic link: value in (0, 1) that is compared against the random threshold below. |
| prob = 1 / (1 + exp (- linForm)); |
| |
| # The thresholds are drawn identically whether or not noise was requested; |
| # the no-noise case only announces that fact. |
| if (noiseFlag != 1) { |
|     print("this data generator generates the same dataset for both noise=0 and noise=1") |
|     # Disabled alternative for the no-noise case (constant threshold of 0.5): |
|     #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform") |
| } |
| r = Rand(rows=nRows, cols=1, min=0, max=1, pdf="uniform", seed=0) |
| |
| print ("nrow(prob) = " + nrow(prob) + ", ncol(prob) = " + ncol(prob) + "; nrow(r) = " + nrow(r) + ", ncol(r) = " + ncol(r)); |
| |
| # Labels in {-1, +1}: -1 where prob falls below the threshold, +1 otherwise. |
| Y = 1 - 2*(prob < r) |
| |
| # Persist weights (text), features (binary) and labels (binary). |
| write (w, $5, format="text"); |
| write (X, $6, format="binary"); |
| write (Y, $7, format="binary"); |
| |
| |
| # Shifts and scales a weight vector so that the linear form X_data %*% w_scaled attains |
| # the requested mean (meanLF) and standard deviation (sigmaLF) over the rows of X_data. |
| # Used in data and/or weight generation in the testing of GLM, Logistic Regression etc. |
| # |
| # The result is searched for in the 2-dimensional space spanned by w_unscaled and the |
| # all-ones vector: one direction is normalised so that its linear form sums to |
| # nrow(X_data) * meanLF, the other so that its linear form sums to zero; the coefficient |
| # of the second direction is then obtained from a quadratic equation for the variance. |
| # |
| # Returns: |
| #   w_scaled    - the adjusted weight vector (same number of rows as w_unscaled) |
| #   new_sigmaLF - equals sigmaLF if the request could be met; larger than sigmaLF if the |
| #                 constraint had to be relaxed because the quadratic has no real solution |
| scaleWeights = |
|     function (Matrix[double] X_data, Matrix[double] w_unscaled, double meanLF, double sigmaLF) |
|     return (Matrix[double] w_scaled, double new_sigmaLF) |
| { |
|     n = nrow (X_data); |
|     d = nrow (w_unscaled); |
| |
|     # Two candidate weight vectors side by side: [w_unscaled, all-ones]. |
|     basis = Rand (rows = d, cols = 2, min = 1, max = 1); |
|     basis [, 1] = w_unscaled; |
| |
|     # Sums of the two corresponding linear forms, and their squared norm. |
|     lfSums = colSums (X_data %*% basis); |
|     lfSumsSq = as.scalar (lfSums %*% t(lfSums)); |
| |
|     # 2x2 mixing matrix: column 1 yields the mean-matching direction, |
|     # column 2 yields the direction whose linear form sums to zero. |
|     mix = Rand (rows = 2, cols = 2, min = 1, max = 1); |
|     mix [1, 1] = lfSums [1, 1] * meanLF * n / lfSumsSq; |
|     mix [1, 2] = lfSums [1, 2]; |
|     mix [2, 1] = lfSums [1, 2] * meanLF * n / lfSumsSq; |
|     mix [2, 2] = - lfSums [1, 1]; |
|     dirs = basis %*% mix; |
| |
|     # Second-moment matrix of the two directions' linear forms; removing n * meanLF^2 |
|     # from the (1,1) entry turns it into a centred sum of squares. |
|     Q = t(dirs) %*% t(X_data) %*% X_data %*% dirs; |
|     Q [1, 1] = Q [1, 1] - n * meanLF * meanLF; |
|     detQ = as.scalar (Q [1, 1] * Q [2, 2] - Q [1, 2] * Q [2, 1]); |
| |
|     # The quadratic in the second coefficient has a real root only if discr <= 0. |
|     new_sigmaLF = sigmaLF; |
|     discr = detQ - as.scalar (n * Q [2, 2] * sigmaLF * sigmaLF); |
|     if (discr > 0.0) { |
|         # Requested sigmaLF is too small: fall back to the smallest attainable value. |
|         new_sigmaLF = sqrt (detQ / as.scalar (n * Q [2, 2])); |
|         discr = -0.0; |
|     } |
| |
|     # Combine: 1 * (mean-matching direction) + root * (zero-sum direction). |
|     coef = Rand (rows = 2, cols = 1, min = 1, max = 1); |
|     coef [2, 1] = (- Q [1, 2] + sqrt (- discr)) / Q [2, 2]; |
|     w_scaled = dirs %*% coef; |
| } |
| |
| |