blob: 5def92430e322c96935f5c22b70e465ac5a213b6 [file] [log] [blame]
#!/bin/bash
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Validate the two required positional arguments.
#   $1 - HDFS base directory for the generated data
#   $2 - execution mode: SPARK, MR, or anything else (e.g. ECHO)
if [[ -z "$1" || -z "$2" ]]; then
  echo "Usage: $0 <hdfsDataDir> <MR | SPARK | ECHO> e.g. $0 perftest SPARK" >&2
  exit 1
fi

# Choose the launcher prefix (CMD) and the dash prefix (DASH) used by the
# generator invocations further down, which pass "${DASH}-nvargs": SPARK runs
# therefore get "--nvargs", every other mode gets "-nvargs".
# DASH is reset explicitly so a value inherited from the caller's environment
# cannot leak into the MR / ECHO modes.
# CMD keeps its trailing space and stays a plain string because later
# commands expand it unquoted and rely on word splitting.
DASH=""
case "$2" in
  SPARK)
    CMD="./sparkDML.sh "
    DASH="-"
    ;;
  MR)
    CMD="hadoop jar SystemML.jar "
    ;;
  *)
    # Any other mode falls back to echo, so the generator invocations are
    # printed instead of executed.
    CMD="echo "
    ;;
esac
# ---- locations ----
# HDFS directory that receives every data set produced by this script.
BASE="${1}/trees"
# spec.json is staged in PATH_LOCAL and then copied to PATH_HDFS.
PATH_HDFS="${BASE}"
PATH_LOCAL="/tmp/datagen"

# ---- generator parameters ----
# Handed to the second generator script as fmt=.
FORMAT=csv
# DENSE_SP is handed to both generator scripts as sp=.
# SPARSE_SP is defined but not referenced in the visible part of this script.
DENSE_SP="0.9"
SPARSE_SP="0.01"
#### part 1: generating class labels and categorical features
# Runs genRandData4DecisionTree1.dml through the selected launcher.
# ${CMD} is intentionally left unquoted: it holds a command plus its leading
# arguments and must be word-split. Every other expansion is quoted so that a
# data directory containing spaces or glob characters is passed through as a
# single argument.
# NOTE(review): num_cat=100 presumably has to match NUM_FEATURES (100) used
# when building spec.json -- confirm before changing either value.
${CMD} -f ../datagen/genRandData4DecisionTree1.dml "${DASH}-nvargs" \
  "XCat=${BASE}/XCat" \
  "Y=${BASE}/Y" \
  num_records=1000 \
  num_cat=100 \
  num_class=10 \
  num_distinct=100 \
  "sp=${DENSE_SP}"
#### part 2: generating spec.json on HDFS
NUM_FEATURES=100

#######################################
# Print the transform specification as JSON.
# Column ids 1..N are listed under both "recode" and "dummycode"; id 1 is
# always emitted, ids 2..N only when N >= 2.
# Arguments: $1 - number of feature columns (N)
# Outputs:   the JSON document on stdout
#######################################
gen_transform_spec() {
  local num_features=$1
  local i
  echo "{ \"ids\": true"
  echo ",\"recode\": [1 "
  # Arithmetic loop instead of `seq 2 N`: some seq implementations count
  # downwards when N < 2, which would emit bogus ids.
  for ((i = 2; i <= num_features; i++)); do
    echo " , $i"
  done
  echo " ] , \"dummycode\": [ 1"
  for ((i = 2; i <= num_features; i++)); do
    echo " , $i"
  done
  echo "] }"
}

# The local staging directory may not exist yet on a fresh machine; without
# it the redirect below fails and no spec.json is produced.
mkdir -p "$PATH_LOCAL"

# Write the whole file with a single redirect instead of reopening it for
# every array element.
gen_transform_spec "$NUM_FEATURES" > "$PATH_LOCAL/spec.json"

# Remove any previous copy first (copyFromLocal does not overwrite an existing
# destination unless -f is given). On a first run the rm reports a missing
# file, which is harmless.
hadoop fs -rm "$PATH_HDFS/spec.json"
hadoop fs -copyFromLocal "$PATH_LOCAL/spec.json" "$PATH_HDFS/spec.json"
#### part 3: generating scale features and transforming categorical features, finally combining scale and categorical features
# Runs genRandData4DecisionTree2.dml through the selected launcher, pointing
# it at the categorical data (XCat) and the transform spec (tSpec) under
# ${BASE}, and naming ${BASE}/X and ${BASE}/metadata as X and tPath.
# ${CMD} is intentionally left unquoted: it holds a command plus its leading
# arguments and must be word-split. Every other expansion is quoted so that a
# data directory containing spaces or glob characters is passed through as a
# single argument.
${CMD} -f ../datagen/genRandData4DecisionTree2.dml "${DASH}-nvargs" \
  "tPath=${BASE}/metadata" \
  "tSpec=${BASE}/spec.json" \
  "XCat=${BASE}/XCat" \
  "X=${BASE}/X" \
  num_records=1000 \
  num_scale=100 \
  "sp=${DENSE_SP}" \
  "fmt=${FORMAT}"