| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| /* |
| ------------------------------------------------ |
| Parameters |
| ------------------------------------------------ |
| $R = #rows |
| $C = #columns |
| $NC = number of categorical attributes |
| $MAXDOMAIN = maximum domain size |
| $DATA = output file path on HDFS |
| $SETSIZE = Size of one bivariate set |
| $LABELSETSIZE= Size of second bivariate set with labels |
| $TYPES = output attribute types |
| $TYPES1 = Attribute types for Set1 |
| $TYPES2 = Attribute types for Set2 |
| $INDEX1 = Indices for Set1 |
| $INDEX2 = Indices for Set2 |
| $FMT = output format |
| ------------------------------------------------ |
| hadoop jar SystemDS.jar -f genData4Stats.dml -nvargs R=1000000 C=1000 NC=50 MAXDOMAIN=1100 DATA=stats/data TYPES=stats/types SETSIZE=15 LABELSETSIZE=10 TYPES1=... TYPES2=... INDEX1=... INDEX2=... FMT=csv |
| ------------------------------------------------ |
| */ |
| |
| |
| # Output format used by every write() in this script; "binary" when $FMT is not supplied. |
| FMT = ifdef($FMT, "binary"); |
| |
| # Categorical attribute count (expected: numC <= $C), split into an ordinal part |
| # (half of numC, truncated to an integer) and a nominal part (the remainder). |
| numC = $NC; |
| numO = as.integer(numC / 2); |
| numNominal = numC - numO; |
| print("Categorical Mix = (" + numC + "," + numO + "," + numNominal +")"); |
| |
| # Largest domain size any categorical attribute may take. |
| maxDomainSize = $MAXDOMAIN; |
| |
| # Column layout of the $C attributes: |
| # |
| #   1      numO      numC               C |
| #   |-------|---------|-----------------| |
| #    ordinal  nominal        scale |
| # |
| #   columns 1 .. numO        -> ordinal (type code 3) |
| #   columns numO+1 .. numC   -> nominal (type code 2) |
| #   columns numC+1 .. $C     -> scale   (type code 1) |
| |
| # Start with every column marked as scale, then overwrite the categorical prefix. |
| types = matrix(1, rows=1, cols=$C); |
| ocutoff = numO; |
| types[1, 1:ocutoff] = matrix(3, rows=1, cols=ocutoff); |
| types[1, (ocutoff+1):numC] = matrix(2, rows=1, cols=numNominal); |
| |
| # Generate data |
| # A is a dense $R x $C matrix of random values drawn with rand's default range; |
| # B receives the final data set (categorical codes first, scale values after). |
| A = rand(rows=$R, cols=$C, sparsity=1); |
| B = matrix(0,rows=nrow(A), cols=ncol(A)); |
| parfor (i in 1:numC) { |
|     Ai = A[,i]; |
| |
|     # draw a random domain size between 1 and maxDomainSize for this attribute |
|     tmp = round(rand(rows=1,cols=1, min=1, max=maxDomainSize)); |
|     domain = as.scalar(tmp[1,1]); |
| |
|     # for some attributes (when a second random draw is below 0.5), choose the maxDomainSize |
|     tmp = rand(rows=1,cols=1); |
|     if (as.scalar(tmp[1,1]) < 0.5) { |
|         domain = maxDomainSize; |
|     } |
| |
|     # rescale the column to rounded category codes between 1 and domain |
|     B[,i] = round(1+(domain-1)*Ai); |
| } |
| |
| # Copy the scale attributes unchanged. When numC == $C there are no scale columns |
| # and the range (numC+1):ncol(A) would be invalid, so the copy is skipped. |
| if (numC < ncol(A)) { |
|     B[ ,(numC+1):ncol(A)] = A[, (numC+1):ncol(A)]; |
| } |
| |
| |
| # Persist the generated data and the per-column attribute types. |
| write(B, $DATA, format=FMT); |
| write(types, $TYPES, format=FMT); |
| |
| # ----- Generator for Bivariate --------- |
| # Set1 holds $SETSIZE attributes: the first catSetSize entries are categorical |
| # (ordinal first, then nominal) and the remaining entries are scale. |
| # settypes1 carries the type codes (3 = ordinal, 2 = nominal, 1 = scale) and |
| # index1 the randomly chosen column indices into the generated data. |
| |
| settypes1 = matrix(1, rows=1, cols=$SETSIZE); |
| index1 = matrix(0, rows=1, cols=$SETSIZE); |
| |
| # sizes of the three groups inside Set1 |
| catSetSize = as.integer($SETSIZE/2); |
| ocutoff = as.integer(catSetSize/2); |
| print("Set Mix = (" + $SETSIZE + "," + catSetSize + "," + ocutoff + ")" ); |
| nominalSetSize = catSetSize - ocutoff; |
| scaleSetSize = $SETSIZE - catSetSize; |
| |
| # type codes for the categorical prefix; the scale suffix keeps the initial 1 |
| settypes1[1, 1:ocutoff] = matrix(3, rows=1, cols=ocutoff); |
| settypes1[1, (ocutoff+1):catSetSize] = matrix(2, rows=1, cols=nominalSetSize); |
| |
| # ordinal indices: random columns from 1 .. numO |
| ordinalDraw = rand(rows=1, cols=ocutoff); |
| index1[1, 1:ocutoff] = round(1 + (numO-1)*ordinalDraw); |
| |
| # nominal indices: random columns from numO+1 .. numC |
| nominalDraw = rand(rows=1, cols=nominalSetSize); |
| index1[1, (ocutoff+1):catSetSize] = round(numO+1 + (numC-numO-1)*nominalDraw); |
| |
| # scale indices: random columns from numC+1 .. $C |
| scaleDraw = rand(rows=1, cols=scaleSetSize); |
| index1[1, (catSetSize+1):$SETSIZE] = round(numC+1 + ($C-numC-1)*scaleDraw); |
| |
| |
| # --- select types and indices for LABELSET |
| # Set2 holds $LABELSETSIZE label attributes: |
| #   entry 1                     -> scale (type 1) when $LABELSETSIZE > 1, otherwise nominal (type 2) |
| #   entries 2 .. LABELSETSIZE/2 -> ordinal (type 3) |
| #   remaining entries           -> nominal (type 2) |
| # index2 stores a randomly chosen column index of the matching type for each entry. |
| settypes2 = matrix(2, rows=1, cols=$LABELSETSIZE); |
| index2 = matrix(0, rows=1, cols=$LABELSETSIZE); |
| if($LABELSETSIZE > 1) { |
|     settypes2[1,1] = 1; |
|     r = as.scalar(rand(rows=1,cols=1)); |
|     index2[1,1] = round(numC+1 + ($C-numC-1)*r); |
| } |
| else { |
|     r = as.scalar(rand(rows=1,cols=1)); |
|     index2[1,1] = round( numO+1 + (numC-numO-1)*r ); |
| } |
| |
| halfLabelSet = as.integer($LABELSETSIZE/2); |
| firstNominal = halfLabelSet + 1; |
| |
| # Ordinal entries 2 .. halfLabelSet. The guard matters for $LABELSETSIZE < 4: there the |
| # upper bound is below 2, and a range whose start exceeds its end is iterated downwards, |
| # which would overwrite entry 1 (and address column 0 when $LABELSETSIZE is 1). |
| if (halfLabelSet >= 2) { |
|     for(i in 2:halfLabelSet) { |
|         settypes2[1,i] = 3; |
|         r = as.scalar(rand(rows=1,cols=1)); |
|         index2[1,i] = round( 1 + (numO-1)*r ); |
|     } |
| } |
| |
| # Nominal entries halfLabelSet+1 .. $LABELSETSIZE. |
| for(i in firstNominal:$LABELSETSIZE) { |
|     settypes2[1,i] = 2; |
|     r = as.scalar(rand(rows=1,cols=1)); |
|     index2[1,i] = round( numO+1 + (numC-numO-1)*r ); |
| } |
| |
| # Persist the type vectors and index vectors of both bivariate sets. |
| write(settypes1, $TYPES1, format=FMT); |
| write(settypes2, $TYPES2, format=FMT); |
| write(index1, $INDEX1, format=FMT); |
| write(index2, $INDEX2, format=FMT); |
| |