blob: 19ee843e3bb618fc7026f1e6c46e543ac4703bd3 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
------------------------------------------------
Parameters
------------------------------------------------
$R = #rows
$C = #columns
$NC = number of categorical attributes
$MAXDOMAIN = maximum domain size
$DATA = output file path on HDFS
$SETSIZE = Size of one bivariate set
$LABELSETSIZE= Size of second bivariate set with labels
$TYPES = output attribute types
$TYPES1 = Attribute types for Set1
$TYPES2 = Attribute types for Set2
$INDEX1 = Indices for Set1
$INDEX2 = Indices for Set2
$FMT = output format
------------------------------------------------
hadoop jar SystemML.jar -f genData4Stats.dml -nvargs R=1000000 C=1000 NC=50 MAXDOMAIN=1100 DATA=stats/data TYPES=stats/types SETSIZE=15 LABELSETSIZE=10 TYPES1=... TYPES2=... INDEX1=... INDEX2=... FMT=csv
------------------------------------------------
*/
# Output format used for every matrix written by this script;
# falls back to "binary" when the caller does not pass FMT.
FMT = ifdef($FMT,"binary");

# Number of categorical attributes (expected: numC <= $C).
# as.integer(numC/2) of them are ordinal, the remaining ones nominal.
numC = $NC;
numO = as.integer(numC/2);
numNominal = numC - numO;
print("Categorical Mix = (" + numC + "," + numO + "," + numNominal +")");

# maximum domain size among all categorical attributes
maxDomainSize = $MAXDOMAIN;

# Divide the $C attributes according to the following logic:
#
#   1      numO       numC                C
#   |-------|---------|-----------------|
#      ord    nominal        scale
#
#   1 .. numO      : ordinal (type code 3)
#   numO+1 .. numC : nominal (type code 2)
#   numC+1 .. $C   : scale   (type code 1, the initial value)
types = matrix(1, rows=1, cols=$C);
ocutoff = numO;

# Guard both assignments: with NC < 2 there are no ordinal attributes and the
# range 1:ocutoff would be 1:0; with NC = 0 the nominal range is empty as well.
# Indexing with such a range is not a valid (empty) selection.
if (ocutoff >= 1) {
  types[1, 1:ocutoff] = matrix(3, rows=1, cols=ocutoff);
}
if (numNominal >= 1) {
  types[1, (ocutoff+1):numC] = matrix(2, rows=1, cols=numNominal);
}
# Generate data
# A holds the raw random values; B is the matrix that is written out. In B the
# first numC (categorical) columns are replaced by rounded category codes and
# the remaining (scale) columns are copied from A unchanged.
A = rand(rows=$R, cols=$C, sparsity=1);
B = matrix(0,rows=nrow(A), cols=ncol(A));
parfor (i in 1:numC) {
  Ai = A[,i];
  # draw a random domain size for this attribute between 1 and maxDomainSize
  domain = as.scalar(round(rand(rows=1,cols=1, min=1, max=maxDomainSize)));
  # for some attributes (second draw below 0.5), choose the maxDomainSize
  if (as.scalar(rand(rows=1,cols=1)) < 0.5) {
    domain = maxDomainSize;
  }
  # map the raw values to codes 1..domain (assuming rand's default [0,1] range)
  B[,i] = round(1+(domain-1)*Ai);
}

# Copy the scale attributes. When every attribute is categorical (numC == $C,
# which "numC <= C" permits) there is nothing to copy and the column range
# (numC+1):ncol(A) would start past the last column, so skip it.
if (numC < ncol(A)) {
  B[ ,(numC+1):ncol(A)] = A[, (numC+1):ncol(A)];
}

write(B, $DATA, format=FMT);
write(types, $TYPES, format=FMT);
# ----- Generator for Bivariate ---------
# Builds the first attribute set: settypes1 holds one type code per selected
# attribute, index1 the column index of that attribute in the generated data.

# Sizes of the three groups inside the set: ordinal | nominal | scale
catSetSize = as.integer($SETSIZE/2);
ocutoff = as.integer(catSetSize/2);
nominalSetSize = catSetSize-ocutoff;
scaleSetSize = $SETSIZE-catSetSize;
print("Set Mix = (" + $SETSIZE + "," + catSetSize + "," + ocutoff + ")" );

# Type codes: everything starts as 1, the ordinal slots become 3 and the
# nominal slots become 2.
settypes1 = matrix(1, rows=1, cols=$SETSIZE);
settypes1[1, 1:ocutoff] = matrix(3, rows=1, cols=ocutoff);
settypes1[1, (ocutoff+1):catSetSize] = matrix(2, rows=1, cols=nominalSetSize);

index1 = matrix(0, rows=1, cols=$SETSIZE);

# ordinal slots: random column indices within 1..numO
ordinalDraw = rand(rows=1, cols=ocutoff);
index1[1, 1:ocutoff] = round(1 + (numO-1)*ordinalDraw);

# nominal slots: random column indices within numO+1..numC
nominalDraw = rand(rows=1, cols=nominalSetSize);
index1[1, (ocutoff+1):catSetSize] = round(numO+1 + (numC-numO-1)*nominalDraw);

# scale slots: random column indices within numC+1..$C
scaleDraw = rand(rows=1, cols=scaleSetSize);
index1[1, (catSetSize+1):$SETSIZE] = round(numC+1 + ($C-numC-1)*scaleDraw);
# --- select types and indices for LABELSET
# Layout of the label set (slots 1..$LABELSETSIZE):
#   slot 1                    : scale attribute (type 1); nominal if the set
#                               has only a single slot
#   slots 2 .. labelHalf      : ordinal attributes (type 3)
#   slots labelHalf+1 .. end  : nominal attributes (type 2, the initial value)
settypes2 = matrix(2, rows=1, cols=$LABELSETSIZE);
index2 = matrix(0, rows=1, cols=$LABELSETSIZE);
labelHalf = as.integer($LABELSETSIZE/2);

if($LABELSETSIZE > 1) {
  # first slot: a random scale attribute, column index within numC+1..$C
  settypes2[1,1] = 1;
  r = as.scalar(rand(rows=1,cols=1));
  index2[1,1] = round(numC+1 + ($C-numC-1)*r);
}
else {
  # single-slot set: a random nominal attribute, column index within numO+1..numC
  r = as.scalar(rand(rows=1,cols=1));
  index2[1,1] = round( numO+1 + (numC-numO-1)*r );
}

# Ordinal slots. For small label sets labelHalf is below 2, so the range
# 2:labelHalf would have a start greater than its end. Depending on the
# runtime's loop semantics that range is either skipped or iterated downwards;
# the latter would overwrite slot 1 (or index slot 0). Skip it explicitly.
if (labelHalf >= 2) {
  for(i in 2:labelHalf) {
    settypes2[1,i] = 3;
    r = as.scalar(rand(rows=1,cols=1));
    index2[1,i] = round( 1 + (numO-1)*r );
  }
}

# Nominal slots: random column indices within numO+1..numC
for(i in (labelHalf+1):$LABELSETSIZE) {
  settypes2[1,i] = 2;
  r = as.scalar(rand(rows=1,cols=1));
  index2[1,i] = round( numO+1 + (numC-numO-1)*r );
}
# Persist the bivariate sets in the selected output format.
# Set 1: attribute types and column indices
write(settypes1, $TYPES1, format=FMT);
write(index1, $INDEX1, format=FMT);
# Set 2 (label set): attribute types and column indices
write(settypes2, $TYPES2, format=FMT);
write(index2, $INDEX2, format=FMT);