@rem blob: 8eb31fa7c973d77664e873e0a5e35517b0f8ff84 [file] [log] [blame]
@echo off
@rem Licensed to the Apache Software Foundation (ASF) under one or more
@rem contributor license agreements. See the NOTICE file distributed with
@rem this work for additional information regarding copyright ownership.
@rem The ASF licenses this file to You under the Apache License, Version 2.0
@rem (the "License"); you may not use this file except in compliance with
@rem the License. You may obtain a copy of the License at
@rem
@rem http://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
REM Resolve the directory that contains this script. The value ends with a
REM backslash, which is why file names are appended to it without a separator.
SET "GRID_MIX=%~dp0"
REM /D switches the current drive as well as the directory, so the script also
REM works when it is launched from a drive other than the one it lives on.
REM The quotes keep paths with spaces or ampersands intact.
CD /D "%GRID_MIX%"
REM Load the shared settings from gridmix-env.cmd next to this script.
REM NOTE(review): presumably this defines HADOOP_HOME, EXAMPLE_JAR and the
REM output locations used later in this script - confirm in that file.
CALL "%GRID_MIX%gridmix-env.cmd"
REM Dataset sizing. All *_DATA_BYTES values are totals in bytes.
REM Smaller data set is used by default.
set COMPRESSED_DATA_BYTES=2147483648
set UNCOMPRESSED_DATA_BYTES=536870912
set INDIRECT_DATA_BYTES=58720256
REM Number of partitions for output data. A NUM_MAPS value that is already set
REM in the environment is kept.
if not defined NUM_MAPS (
    set NUM_MAPS=100
)
set INDIRECT_DATA_FILES=200
REM If the env var USE_REAL_DATASET is set, then use the params to generate the bigger (real) dataset.
if defined USE_REAL_DATASET (
    echo "Using real dataset"
    REM 2TB data compressing to approx 500GB
    set COMPRESSED_DATA_BYTES=2147483648000
    REM 500GB
    set UNCOMPRESSED_DATA_BYTES=536870912000
    REM Default approx 70MB per data file, compressed
    set INDIRECT_DATA_BYTES=58720256000
)
REM Per-map byte counts = total bytes / NUM_MAPS, truncated to an integer.
REM SET /A only handles signed 32-bit numbers, and even the default
REM COMPRESSED_DATA_BYTES (2147483648) is one past that limit, so a plain
REM SET /A division fails. The division is therefore done digit by digit
REM (schoolbook long division) on the decimal string. Every intermediate value
REM stays below 10 * NUM_MAPS, which fits in 32 bits for any NUM_MAPS up to
REM roughly 214 million.
setlocal EnableDelayedExpansion
for %%V in (COMPRESSED UNCOMPRESSED INDIRECT) do (
    set "DIVIDEND=!%%V_DATA_BYTES!"
    set "QUOTIENT="
    set /a REMAINDER=0
    REM 32 digit positions is far more than any of the byte counts above need.
    for /L %%I in (0,1,31) do (
        set "DIGIT=!DIVIDEND:~%%I,1!"
        if defined DIGIT (
            set /a "REMAINDER=REMAINDER*10+DIGIT, QDIGIT=REMAINDER/NUM_MAPS, REMAINDER=REMAINDER %% NUM_MAPS"
            REM Leading zero digits of the quotient are skipped.
            if defined QUOTIENT (
                set "QUOTIENT=!QUOTIENT!!QDIGIT!"
            ) else (
                if not "!QDIGIT!"=="0" set "QUOTIENT=!QDIGIT!"
            )
        )
    )
    if not defined QUOTIENT set "QUOTIENT=0"
    set "%%V_BYTES_PER_MAP=!QUOTIENT!"
)
REM Drop the helper variables and the delayed-expansion mode, but carry the three
REM results out of the local scope. The percent values on this line are expanded
REM before ENDLOCAL runs, so they still hold the freshly computed quotients.
endlocal & set "COMPRESSED_BYTES_PER_MAP=%COMPRESSED_BYTES_PER_MAP%" & set "UNCOMPRESSED_BYTES_PER_MAP=%UNCOMPRESSED_BYTES_PER_MAP%" & set "INDIRECT_BYTES_PER_MAP=%INDIRECT_BYTES_PER_MAP%"
REM Run the randomtextwriter example with keys of 5-10 words and values of
REM 100-10000 words, output compression enabled, SequenceFile output format.
REM NOTE(review): VARCOMPSEQ is presumably the output location defined by
REM gridmix-env.cmd - confirm there.
REM The launcher path uses backslashes, the native Windows separator, to match
REM the other hadoop invocations in this script.
CALL "%HADOOP_HOME%\bin\hadoop" jar ^
    %EXAMPLE_JAR% randomtextwriter ^
    -D "test.randomtextwrite.total_bytes=%COMPRESSED_DATA_BYTES%" ^
    -D "test.randomtextwrite.bytes_per_map=%COMPRESSED_BYTES_PER_MAP%" ^
    -D "test.randomtextwrite.min_words_key=5" ^
    -D "test.randomtextwrite.max_words_key=10" ^
    -D "test.randomtextwrite.min_words_value=100" ^
    -D "test.randomtextwrite.max_words_value=10000" ^
    -D "mapred.output.compress=true" ^
    -D "mapred.map.output.compression.type=BLOCK" ^
    -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat ^
    %VARCOMPSEQ%
REM Run the randomtextwriter example with fixed-size records: exactly 5 words
REM per key and 100 words per value (min equals max), output compression
REM enabled, SequenceFile output format.
REM NOTE(review): FIXCOMPSEQ is presumably the output location defined by
REM gridmix-env.cmd - confirm there.
REM The launcher path uses backslashes, the native Windows separator, to match
REM the other hadoop invocations in this script.
CALL "%HADOOP_HOME%\bin\hadoop" jar ^
    %EXAMPLE_JAR% randomtextwriter ^
    -D "test.randomtextwrite.total_bytes=%COMPRESSED_DATA_BYTES%" ^
    -D "test.randomtextwrite.bytes_per_map=%COMPRESSED_BYTES_PER_MAP%" ^
    -D "test.randomtextwrite.min_words_key=5" ^
    -D "test.randomtextwrite.max_words_key=5" ^
    -D "test.randomtextwrite.min_words_value=100" ^
    -D "test.randomtextwrite.max_words_value=100" ^
    -D "mapred.output.compress=true" ^
    -D "mapred.map.output.compression.type=BLOCK" ^
    -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat ^
    %FIXCOMPSEQ%
REM Uncompressed text data set: keys of 1-10 words, values of 0-200 words,
REM output compression disabled, TextOutputFormat.
REM NOTE(review): VARINFLTEXT is presumably the output location defined by
REM gridmix-env.cmd - confirm there.
call "%HADOOP_HOME%\bin\hadoop" jar %EXAMPLE_JAR% randomtextwriter ^
    -D "test.randomtextwrite.total_bytes=%UNCOMPRESSED_DATA_BYTES%" ^
    -D "test.randomtextwrite.bytes_per_map=%UNCOMPRESSED_BYTES_PER_MAP%" ^
    -D "test.randomtextwrite.min_words_key=1" -D "test.randomtextwrite.max_words_key=10" ^
    -D "test.randomtextwrite.min_words_value=0" -D "test.randomtextwrite.max_words_value=200" ^
    -D "mapred.output.compress=false" ^
    -outFormat org.apache.hadoop.mapred.TextOutputFormat %VARINFLTEXT%
REM Fixed-size compressed text data set: exactly 5 words per key and 20 words
REM per value (min equals max), output compression enabled, TextOutputFormat.
REM NOTE(review): FIXCOMPTEXT is presumably the output location defined by
REM gridmix-env.cmd - confirm there.
call "%HADOOP_HOME%\bin\hadoop" jar %EXAMPLE_JAR% randomtextwriter ^
    -D "test.randomtextwrite.total_bytes=%INDIRECT_DATA_BYTES%" ^
    -D "test.randomtextwrite.bytes_per_map=%INDIRECT_BYTES_PER_MAP%" ^
    -D "test.randomtextwrite.min_words_key=5" -D "test.randomtextwrite.max_words_key=5" ^
    -D "test.randomtextwrite.min_words_value=20" -D "test.randomtextwrite.max_words_value=20" ^
    -D "mapred.output.compress=true" -D "mapred.map.output.compression.type=BLOCK" ^
    -outFormat org.apache.hadoop.mapred.TextOutputFormat %FIXCOMPTEXT%