blob: 29fd0d7a1762a78e011b5a7670e38fa34f239757 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.impala.datagenerator;
import java.math.BigDecimal;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.TimeZone;
// Generates comma-separated text files of test data. The Generate* methods of
// this class write those files under a caller-supplied directory.
class TestDataGenerator {
// Default number of partitions for the AllTypes data set. Partitions are one
// month long, so 24 partitions cover 2 years.
private static final int DEFAULT_NUM_PARTITIONS = 24;
// Default cap on rows per partition: 10 tuples per day of a 31-day month.
private static final int DEFAULT_MAX_TUPLES_PER_PARTITION = 310;
// Year whose December 31 bounds the AllTypes data set; arbitrary default value.
private static final int DEFAULT_END_YEAR = 2010;
// Value written as the first (id) column of AllTypes rows. It is shared by all
// partitions of a data set and is reset to 0 by GenerateAllTypesData and
// GenerateAllTypesAggData.
private static int id = 0;
// Writes the AllTypes data set into 'dir' as a sequence of one-month partitions.
// Partitions start on January 1 of the start year computed below. The loop runs
// while the current partition start is before December 31 of DEFAULT_END_YEAR
// and 'months' is below numPartitions. Every partition is generated with 10 ints
// per day, at most maxTuplesPerPartition rows, and writeNulls disabled.
// Throws IOException as declared (propagated from GenerateAllTypesPartition).
private static void GenerateAllTypesData(String dir, int numPartitions,
int maxTuplesPerPartition) throws IOException {
// Restart the id column at 0 for this data set.
id = 0;
// Years to go back from DEFAULT_END_YEAR: one less than the number of whole
// years in numPartitions, but never less than 1.
int numYears = Math.max((numPartitions / 12) - 1, 1);
// Clamp so the start year cannot become negative.
int startYear = Math.max(DEFAULT_END_YEAR - numYears, 0);
GregorianCalendar date = new GregorianCalendar(startYear, Calendar.JANUARY, 1);
GregorianCalendar endDate = new GregorianCalendar(DEFAULT_END_YEAR, Calendar.DECEMBER, 31);
int months = 0;
// NOTE(review): no statement visible in this listing increments 'months', and
// the closing braces of the loop and of the method are not visible either.
// Lines appear to be missing here -- confirm against the upstream file.
while (date.before(endDate) && months < numPartitions) {
// The partition spans from 'date' up to (not including) a copy of 'date'
// advanced by one month.
GregorianCalendar nextMonth = (GregorianCalendar) date.clone();
nextMonth.add(Calendar.MONTH, 1);
GenerateAllTypesPartition(dir, date, nextMonth, 10, maxTuplesPerPartition, false);
// The next partition starts where this one ended.
date = nextMonth;
// Writes the AllTypesAgg data set into 'dir' as one-day partitions covering the
// 10 days starting on January 1, 2010. Each partition is generated with 1000
// ints per day and at most 1000 rows. 'writeNulls' is passed through to
// GenerateAllTypesPartition unchanged.
// Throws IOException as declared (propagated from GenerateAllTypesPartition).
private static void GenerateAllTypesAggData(String dir, boolean writeNulls)
throws IOException {
// Restart the id column at 0 for this data set.
id = 0;
int startYear = 2010;
GregorianCalendar date = new GregorianCalendar(startYear, Calendar.JANUARY, 1);
// End bound is the start date plus 10 days (exclusive, see the loop condition).
GregorianCalendar endDate = (GregorianCalendar) date.clone();
endDate.add(Calendar.DAY_OF_MONTH, 10);
// NOTE(review): the closing braces of this loop and of the method are not
// visible in this listing -- confirm against the upstream file.
while (date.before(endDate)) {
// The partition spans from 'date' up to (not including) the following day.
GregorianCalendar nextDay = (GregorianCalendar) date.clone();
nextDay.add(Calendar.DAY_OF_MONTH, 1);
GenerateAllTypesPartition(dir, date, nextDay, 1000, 1000, writeNulls);
// The next partition starts where this one ended.
date = nextDay;
// Writes one partition file of AllTypes rows. The file is created in 'dir' and
// named after startDate formatted as "yyMMdd", with a ".txt" extension.
// Rows are produced while 'date' is before endDate and 'count' is below
// maxTuplesPerPartition; within that, int_col runs from 0 up to intsPerDay.
// When writeNulls is true, each numeric column is written as an empty string
// (instead of its value) whenever the value tested for that column is 0.
// Throws IOException as declared (the output file is opened via FileWriter).
private static void GenerateAllTypesPartition(String dir, Calendar startDate,
Calendar endDate, int intsPerDay, int maxTuplesPerPartition, boolean writeNulls)
throws IOException {
SimpleDateFormat filenameFormat = new SimpleDateFormat("yyMMdd");
PrintWriter writer = new PrintWriter(new FileWriter(new File(new File(dir),
filenameFormat.format(startDate.getTime()) + ".txt")));
// Work on a copy so the caller's startDate is not modified by the add() calls.
Calendar date = (Calendar) startDate.clone();
// Format of date_string_col.
SimpleDateFormat df = new SimpleDateFormat("MM/dd/yy");
// Format of timestamp_col.
SimpleDateFormat tsf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");
int count = 0;
while (date.before(endDate) && count < maxTuplesPerPartition) {
for (int int_col = 0; int_col < intsPerDay && count < maxTuplesPerPartition;
++int_col) {
// All column values are derived from int_col, except bool_col, which follows
// the parity of the shared id counter.
boolean bool_col = (id % 2 == 0 ? true : false);
byte tinyint_col = (byte) (int_col % 10);
short smallint_col = (short) (int_col % 100);
long bigint_col = int_col * 10;
float float_col = (float) (1.1 * int_col);
double double_col = 10.1 * int_col;
String date_string_col = df.format(date.getTime());
String string_col = String.valueOf(int_col);
String timestamp_col = tsf.format(date.getTime());
// First part of the row: id, bool and the four integer columns, each followed
// by a comma. tinyint and smallint are blanked on their own value being 0,
// int on int_col being 0, bigint on bigint_col being 0 (only if writeNulls).
writer.format("%d,%b,%s,%s,%s,%s,", id, bool_col,
(writeNulls && tinyint_col == 0 ? "" : Byte.toString(tinyint_col)),
(writeNulls && smallint_col == 0 ? "" : Short.toString(smallint_col)),
(writeNulls && int_col == 0 ? "" : Integer.toString(int_col)),
(writeNulls && bigint_col == 0 ? "" : Long.toString(bigint_col)));
// NOTE(review): the format call above is already closed after six values, so
// the float, double, date string, string and timestamp expressions below have
// no visible call that consumes them. The line opening a second format call
// appears to be missing from this listing -- confirm against the upstream file.
// Both float and double are blanked when int_col is 0 (only if writeNulls).
(writeNulls && int_col == 0 ? "" : Float.toString(float_col)),
(writeNulls && int_col == 0 ? "" : Double.toString(double_col)),
date_string_col, string_col, timestamp_col);
// NOTE(review): the closing braces of both loops are not visible, so it cannot
// be determined from this listing which of the three date updates below run per
// row and which run per day. Likewise, no visible statement increments 'id' or
// 'count', or closes 'writer' -- confirm against the upstream file.
date.add(Calendar.MINUTE, 1);
date.add(Calendar.MILLISECOND, (int)bigint_col);
date.add(Calendar.DAY_OF_MONTH, 1);
// Generate cols with schema: DECIMAL(10, 4) and DECIMAL(15, 5), DECIMAL(1,1)
// Writes numRows rows of three comma-separated decimal values. The columns start
// at 0, 100 and 0 and grow by 0.1111, 1.22222 and 0.1 per row respectively; the
// third column is reset to 0 every 10 rows.
// Throws IOException as declared (the output file is opened via FileWriter).
private static void GenerateDecimalData(String dir, int numRows) throws IOException {
// NOTE(review): the statement below is cut off in this listing; the file name
// and the closing parentheses are not visible -- confirm against the upstream
// file.
PrintWriter writer = new PrintWriter(new FileWriter(new File(new File(dir),
// Running values of the three columns.
double col1 = 0;
double col2 = 100;
double col3 = 0;
// Per-row increments of the three columns.
double col1Delta = 0.1111;
double col2Delta = 1.22222;
double col3Delta = 0.1;
// NOTE(review): the closing braces of this loop and of the method, and any
// statement closing 'writer', are not visible in this listing.
for (int i = 0; i < numRows; ++i) {
// Restart the third column every 10 rows, before the row is written.
if (i % 10 == 0) col3 = 0;
// Render each value with a fixed number of fraction digits (4, 5 and 1),
// rounding half up.
String s1 =
BigDecimal.valueOf(col1).setScale(4, BigDecimal.ROUND_HALF_UP).toString();
String s2 =
BigDecimal.valueOf(col2).setScale(5, BigDecimal.ROUND_HALF_UP).toString();
String s3 =
BigDecimal.valueOf(col3).setScale(1, BigDecimal.ROUND_HALF_UP).toString();
writer.format("%s,%s,%s\n", s1, s2,s3);
// Advance all three columns for the next row.
col1 += col1Delta;
col2 += col2Delta;
col3 += col3Delta;
/**
 * Generate some test data.
 * @param args
 *          args[0] is BaseOutputDirectory: the required base output folder of
 *          the generated data files.
 * @throws Exception
 *           something bad happened
 */
// Entry point. Expects exactly one argument, the base output directory, and
// generates each data set into its own sub-directory of it: AllTypes,
// AllTypesSmall, AllTypesTiny, AllTypesAgg, AllTypesAggNoNulls and DecimalTiny.
public static void main(String args[]) throws Exception {
// Anything other than exactly one argument is a usage error.
// NOTE(review): the rest of this 'if' body and its closing brace are not
// visible in this listing.
if (args.length != 1) {
System.err.println("Usage: " + "TestDataGenerator BaseOutputDirectory");
// The TimeZone should be the same no matter what the TimeZone is of the computer
// running this code, in order to ensure the generated data is always the same.
// NOTE(review): no statement that sets the time zone is visible after the
// comment above -- confirm against the upstream file.
// Generate AllTypes
String dirName = args[0] + "/AllTypes";
// NOTE(review): 'dir' is assigned for every data set but never used in the
// visible lines; presumably a statement creating the directory is missing
// after each assignment -- confirm.
File dir = new File(dirName);
// NOTE(review): this call is cut off in this listing; its last argument and
// closing parenthesis are not visible.
GenerateAllTypesData(dirName, DEFAULT_NUM_PARTITIONS,
// Generate AllTypesSmall: 4 partitions, at most 25 rows each.
dirName = args[0] + "/AllTypesSmall";
dir = new File(dirName);
GenerateAllTypesData(dirName, 4, 25);
// Generate AllTypesTiny: 4 partitions, at most 2 rows each.
dirName = args[0] + "/AllTypesTiny";
dir = new File(dirName);
GenerateAllTypesData(dirName, 4, 2);
// Generate AllTypesAgg
dirName = args[0] + "/AllTypesAgg";
dir = new File(dirName);
GenerateAllTypesAggData(dirName, true);
// Generate AllTypesAgg w/o nulls
dirName = args[0] + "/AllTypesAggNoNulls";
dir = new File(dirName);
GenerateAllTypesAggData(dirName, false);
// Generate Decimal data: 100 rows.
dirName = args[0] + "/DecimalTiny";
dir = new File(dirName);
GenerateDecimalData(dirName, 100);