blob: e2463402a74f15bc8c2e0c784cf8fb358cf0a065 [file] [log] [blame]
#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# A utility to generate test data for pig test harness tests.
#
#
use strict;
use charnames ();
use Cwd;
use IPC::Run qw(run);
our @firstName = ("alice", "bob", "calvin", "david", "ethan", "fred",
"gabriella", "holly", "irene", "jessica", "katie", "luke", "mike", "nick",
"oscar", "priscilla", "quinn", "rachel", "sarah", "tom", "ulysses", "victor",
"wendy", "xavier", "yuri", "zach");
our @lastName = ("allen", "brown", "carson", "davidson", "ellison", "falkner",
"garcia", "hernandez", "ichabod", "johnson", "king", "laertes", "miller",
"nixon", "ovid", "polk", "quirinius", "robinson", "steinbeck", "thompson",
"underhill", "van buren", "white", "xylophone", "young", "zipper");
sub randomName()
{
return sprintf("%s %s", $firstName[int(rand(26))],
$lastName[int(rand(26))]);
}
our @city = ("albuquerque", "bombay", "calcutta", "danville", "eugene",
"frankfurt", "grenoble", "harrisburg", "indianapolis",
"jerusalem", "kellogg", "lisbon", "marseilles",
"nice", "oklohoma city", "paris", "queensville", "roswell",
"san francisco", "twin falls", "umatilla", "vancouver", "wheaton",
"xacky", "youngs town", "zippy");
sub randomCity()
{
return $city[int(rand(26))];
}
our @state = ( "AL", "AK", "AS", "AZ", "AR", "CA", "CO", "CT", "DE", "DC",
"FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
"MA", "MI", "MN", "MS", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
"ND", "OH", "OK", "OR", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA",
"WA", "WV", "WI", "WY");
sub randomState()
{
return $state[int(rand(50))];
}
our @classname = ("american history", "biology", "chemistry", "debate",
"education", "forestry", "geology", "history", "industrial engineering",
"joggying", "kindergarten", "linguistics", "mathematics", "nap time",
"opthamology", "philosophy", "quiet hour", "religion", "study skills",
"topology", "undecided", "values clariffication", "wind surfing",
"xylophone band", "yard duty", "zync studies");
sub randomClass()
{
return $classname[int(rand(26))];
}
our @grade = ("A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-",
"F");
sub randomGrade()
{
return $grade[int(rand(int(@grade)))];
}
our @registration = ("democrat", "green", "independent", "libertarian",
"republican", "socialist");
sub randomRegistration()
{
return $registration[int(rand(int(@registration)))];
}
sub randomAge()
{
return (int(rand(60)) + 18);
}
sub randomGpa()
{
return rand(4.0);
}
our @street = ("A", "B", "C", "D", "E", "F", "G", "H", "I",
"J", "K", "L", "M", "N", "O", "P", "Q", "R", "S",
"T", "U", "V", "W", "X", "Y", "Z");
sub randomStreet()
{
return sprintf("%d %s st", int(rand(1000)), $street[int(rand(26))]);
}
sub randomZip()
{
return int(rand(100000));
}
sub randomContribution()
{
return sprintf("%.2f", rand(1000));
}
our @numLetter = ("1", "09", "09a");
sub randomNumLetter()
{
return $numLetter[int(rand(int(@numLetter)))];
}
our @greekLetter = ( "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
"eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron",
"pi", "rho", "sigma", "tau", "upsilon", "chi", "phi", "psi", "omega" );
sub randomGreekLetter()
{
return $greekLetter[int(rand(int(@greekLetter)))];
}
sub randomNameAgeGpaMap()
{
my $size = int(rand(3));
my @mapValues = ( "name#" . randomName(), "age#" . randomAge(), "gpa#" . randomGpa() );
$size = ($size == 0 ? 1 : $size);
my $map;
for(my $i = 0; $i <= $size; $i++) {
$map .= $mapValues[$i];
if($i != $size) {
$map .= ",";
}
}
return $map;
}
sub getMapFields($) {
my $mapString = shift;
# remove the enclosing square brackets
$mapString =~ s/[\[\]]//g;
# get individual map fields
my @fields = split(/,/, $mapString);
# get only the values
my $hash;
for my $field (@fields) {
if($field =~ /(\S+)#(.*)/) {
$hash->{$1} = $2;
}
}
return $hash;
}
sub randomNameAgeGpaTuple()
{
my $gpa = sprintf("%0.2f", randomGpa());
return randomName() . "," . randomAge() . "," . $gpa ;
}
sub randomList()
{
my $size = int(rand(int(3))) + 1;
my $bag;
for(my $i = 0; $i <= $size; $i++) {
$bag .= randomAge();
$bag .= "," if ($i != $size);
}
return $bag;
}
sub randomEscape()
{
my $r = rand(1);
if ($r < 0.16) {
return '\"';
} elsif ($r < 0.32) {
return '\\\\';
} elsif ($r < 0.48) {
return '\/';
} elsif ($r < 0.64) {
return '\n';
} elsif ($r < 0.80) {
return '\t';
} else {
return randomUnicodeHex();
}
}
sub randomJsonString()
{
my $r = rand(1);
#if ($r < 0.05) {
# return "null";
#} elsif ($r < 0.10) {
# return randomName() . randomEscape() . randomName();
#} else {
return randomName();
#}
}
sub randomNullBoolean()
{
my $r = rand(1);
if ($r < 0.05) {
return 'null';
} elsif ($r < 0.525) {
return 'true';
} else {
return 'false';
}
}
sub randomJsonMap()
{
if (rand(1) < 0.05) {
return 'null';
}
my $str = "{";
my $num = rand(5) + 1;
for (my $i = 0; $i < $num; $i++) {
$str .= "," unless $i == 0;
$str .= '"' . randomCity() . '" : "' . randomName() . '"';
}
$str .= "}";
return $str;
}
sub randomJsonBag()
{
if (rand(1) < 0.05) {
return 'null';
}
my $str = "[";
my $num = rand(5) + 1;
for (my $i = 0; $i < $num; $i++) {
$str .= "," unless $i == 0;
$str .= '{"a":' . int(rand(2**32) - 2**31) . ',"b":"' .
randomJsonString() . '"}';
}
$str .= "]";
}
sub usage()
{
warn "Usage: $0 filetype numrows tablename hdfstargetdir [format]\n";
warn "\tValid filetypes [studenttab, studentparttab, \n";
warn "\t\tstudentnull, allscalars, studentcomplextab, \n";
warn "\t\tvoternulltab votertab, unicode]\n";
warn "hdfstargetdir is the directory in hdfs that data will be copied to for loading into tables\n";
warn "format is one of rc, csv, or json. csv is the default";
}
our @greekUnicode = ("\N{U+03b1}", "\N{U+03b2}", "\N{U+03b3}", "\N{U+03b4}",
"\N{U+03b5}", "\N{U+03b6}", "\N{U+03b7}", "\N{U+03b8}", "\N{U+03b9}",
"\N{U+03ba}", "\N{U+03bb}", "\N{U+03bc}", "\N{U+03bd}", "\N{U+03be}",
"\N{U+03bf}", "\N{U+03c0}", "\N{U+03c1}", "\N{U+03c2}", "\N{U+03c3}",
"\N{U+03c4}", "\N{U+03c5}", "\N{U+03c6}", "\N{U+03c7}", "\N{U+03c8}",
"\N{U+03c9}");
sub randomUnicodeNonAscii()
{
my $name = $firstName[int(rand(int(@firstName)))] .
$greekUnicode[int(rand(int(@greekUnicode)))];
return $name;
}
sub randomUnicodeHex()
{
return sprintf "\\u%04x", 0x3b1 + int(rand(25));
}
my $testvar = "\N{U+03b1}\N{U+03b3}\N{U+03b1}\N{U+03c0}\N{U+03b7}";
sub getBulkCopyCmd($$;$)
{
my ($tableName, $delimeter, $filename) = @_;
$filename = $tableName if (!defined($filename));
return "load data local infile '" . cwd . "/$filename'
into table $tableName
columns terminated by '$delimeter';"
}
sub generateSecondHalfCreateTable($$$;$$$)
{
my ($hivefp, $format, $location, $fieldDelim, $structDelim, $mapDelim) = @_;
if ($format eq "csv") {
print $hivefp "
row format delimited
fields terminated by '$fieldDelim'
stored as textfile
location '$location';\n";
} elsif ($format eq "rc") {
print $hivefp "
stored as rcfile
location '$location';\n";
} elsif ($format eq "json") {
print $hivefp "
row format serde 'org.apache.hcatalog.data.JsonSerDe'
stored as textfile
location '$location'
;\n";
} else {
die "Unknown format $format\n";
}
}
sub findAllJars()
{
my @files = <../../../../../core/build/lib/test/*.jar>;
my $classpath = "";
my $file = undef;
foreach $file (@files) {
$classpath = $classpath . ":" . $file;
}
return $classpath;
}
sub getJavaCmd()
{
if (defined $ENV{'JAVA_HOME'}) {
return "$ENV{'JAVA_HOME'}/bin/java";
} else {
my $java = `which java`;
if ($?) {
die "Unable to find java executable;"
} else {
return $java;
}
}
}
# main
{
# explicitly call srand so we get the same data every time
# we generate it. However, we set it individually for each table type.
# Otherwise we'd be generating the same data sets regardless of size,
# and this would really skew our joins.
my $filetype = shift;
my $numRows = shift;
my $tableName = shift;
my $hdfsTargetDir= shift;
my $format = shift;
die usage() if (!defined($filetype) || !defined($numRows) || !defined($tableName) || !defined($hdfsTargetDir));
if ($numRows <= 0) { usage(); }
$format = "csv" if not defined $format;
if ($format eq "csv") {
open(HDFS, "> $tableName") or die("Cannot open file $tableName, $!\n");
}
open(MYSQL, "> $tableName.mysql.sql") or
die("Cannot open file $tableName.mysql.sql, $!\n");
open(my $hivefp, "> $tableName.hcat.sql") or
die("Cannot open file $tableName.hive.sql, $!\n");
if ($filetype eq "studenttab") {
srand(3.14159 + $numRows);
print MYSQL "drop table if exists $tableName;\n";
print MYSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n";
print $hivefp "drop table if exists $tableName;\ncreate external table $tableName(
name string,
age int,
gpa double)";
generateSecondHalfCreateTable($hivefp, $format,
"$hdfsTargetDir/$tableName", '\\t');
if ($format eq "csv") {
print MYSQL &getBulkCopyCmd($tableName, "\t", "$tableName");
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomName();
my $age = randomAge();
my $gpa = randomGpa();
printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa;
}
} elsif ($format eq "rc") {
print MYSQL &getBulkCopyCmd($tableName, "\t", "$tableName.plain");
my $allJars = findAllJars();
my @cmd = (getJavaCmd(), '-cp',
"../tools/generate/java/hive-gen.jar:$allJars",
'org.apache.hadoop.hive.tools.generate.RCFileGenerator',
'student', $numRows, "$tableName", "$tableName.plain");
run(\@cmd) or die "Unable to run command [" . join(" ", @cmd)
. "]\n";
#@cmd = ('java', '-cp',
# "$hiveCliJar:$hiveExecJar:$hadoopCoreJar:" .
# "$commonsCliJar:$commonsConfigJar",
# "org.apache.hadoop.hive.cli.RCFileCat", "$tableName");
#run(\@cmd, '>', $tableName) or
# die "Unable to run command [" . join(" ", @cmd) . "]\n";
} else {
die "Unknown format $format\n";
}
} elsif ($filetype eq "studentparttab") {
srand(3.14159 + $numRows);
print MYSQL "drop table if exists $tableName;\n";
print MYSQL "create table $tableName (name varchar(100), age integer, gpa float(3), ds char(8));\n";
print MYSQL &getBulkCopyCmd($tableName, "\t", "$tableName.mysql");
print $hivefp "drop table if exists $tableName;\ncreate external table $tableName(
name string,
age int,
gpa double)
partitioned by (ds string)
row format delimited
fields terminated by '\\t'
stored as textfile
location '$hdfsTargetDir/$tableName';
alter table $tableName add IF NOT EXISTS partition (ds='20110924') location '$hdfsTargetDir/$tableName/$tableName.20110924';
alter table $tableName add IF NOT EXISTS partition (ds='20110925') location '$hdfsTargetDir/$tableName/$tableName.20110925';
alter table $tableName add IF NOT EXISTS partition (ds='20110926') location '$hdfsTargetDir/$tableName/$tableName.20110926';
";
open(MYSQLDATA, "> $tableName.mysql") or die("Cannot open file $tableName.mysql, $!\n");
for (my $ds = 20110924; $ds < 20110927; $ds++) {
close(HDFS);
open(HDFS, "> $tableName.$ds") or die("Cannot open file $tableName.$ds, $!\n");
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomName();
my $age = randomAge();
my $gpa = randomGpa();
printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa;
printf MYSQLDATA "%s\t%d\t%.3f\t%d\n", $name, $age, $gpa, $ds;
}
}
close(MYSQLDATA);
} elsif ($filetype eq "studentnull") {
srand(3.14159 + $numRows);
print MYSQL "drop table if exists $tableName;\n";
print MYSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n";
print $hivefp "drop table if exists $tableName;\ncreate external table $tableName(
name string,
age int,
gpa double)
row format delimited
fields terminated by '\\001'
stored as textfile
location '$hdfsTargetDir/$tableName';\n";
for (my $i = 0; $i < $numRows; $i++) {
# generate nulls in a random fashion
my $name = rand(1) < 0.05 ? '' : randomName();
my $age = rand(1) < 0.05 ? '' : randomAge();
my $gpa = rand(1) < 0.05 ? '' : randomGpa();
printf MYSQL "insert into $tableName (name, age, gpa) values(";
print MYSQL ($name eq ''? "null, " : "'$name', "), ($age eq ''? "null, " : "$age, ");
if($gpa eq '') {
print MYSQL "null);\n"
} else {
printf MYSQL "%.2f);\n", $gpa;
}
print HDFS "$name$age";
if($gpa eq '') {
print HDFS "\n"
} else {
printf HDFS "%.2f\n", $gpa;
}
}
print MYSQL "commit;\n";
} elsif ($filetype eq "allscalars") {
srand(2.718281828459 + $numRows);
print MYSQL "drop table if exists $tableName;\n";
print MYSQL "create table $tableName (t tinyint, si smallint, i int, b
bigint, f double, d double, s varchar(25));\n";
print MYSQL &getBulkCopyCmd($tableName, ':');
print $hivefp "drop table if exists $tableName;\ncreate external table $tableName(
t tinyint,
si smallint,
i int,
b bigint,
f float,
d double,
s string)
row format delimited
fields terminated by ':'
stored as textfile
location '$hdfsTargetDir/$tableName';\n";
for (my $i = 0; $i < $numRows; $i++) {
printf HDFS "%d:%d:%d:%ld:%.2f:%.2f:%s\n",
(int(rand(2**8) - 2**7)),
(int(rand(2**16) - 2**15)),
(int(rand(2**32) - 2**31)),
(int(rand(2**64) - 2**61)),
rand(100000.0) - 50000.0,
rand(10000000.0) - 5000000.0,
randomName();
}
} elsif ($filetype eq "studentcomplextab") {
srand(3.14159 + $numRows);
my $mapTable = $tableName . "_map";
my $listTable = $tableName . "_list";
print MYSQL "drop table if exists $tableName;\n";
print MYSQL "drop table if exists $mapTable;\n";
print MYSQL "drop table if exists $listTable;\n";
print MYSQL "create table $tableName (id integer, name varchar(100), age integer, gpa float(3));\n";
print MYSQL "create table $mapTable (tid integer, mkey varchar(100), mvalue varchar(100));\n";
print MYSQL "create table $listTable (tid integer, lvalue integer);\n";
print MYSQL "begin;\n";
print $hivefp "drop table if exists $tableName;
create external table $tableName(
m map<string, string>,
s struct <name: string, age: int, gpa: float>,
a array <int>)
row format delimited
fields terminated by '\\t'
collection items terminated by ','
map keys terminated by '#'
stored as textfile
location '$hdfsTargetDir/$tableName';\n";
for (my $i = 0; $i < $numRows; $i++) {
# generate nulls in a random fashion
my $map = rand(1) < 0.05 ? '' : randomNameAgeGpaMap();
my $tuple = rand(1) < 0.05 ? '' : randomNameAgeGpaTuple();
my $list = rand(1) < 0.05 ? '' : randomList();
print MYSQL "insert into $tableName (id, name, age, gpa) values(";
print MYSQL "$i, ";
if ($tuple eq '') {
print MYSQL "null, null, null";
} else {
my @t = split(',', $tuple);
print MYSQL "'$t[0]', $t[1], $t[2]";
}
print MYSQL ");\n";
if ($map ne '') {
my $mapHash = getMapFields($map);
foreach my $k (keys(%$mapHash)) {
print MYSQL "insert into $mapTable (tid, mkey, mvalue) values($i, '$k', '$mapHash->{$k}');\n";
}
}
if ($list ne '') {
my @ls = split(',', $list);
foreach my $e (@ls) {
print MYSQL "insert into $listTable (tid, lvalue) values($i, $e);\n";
}
}
print HDFS "$map\t$tuple\t$list\n";
}
print MYSQL "commit;\n";
} elsif ($filetype eq "votertab") {
srand(299792458 + $numRows);
print MYSQL "drop table if exists $tableName;\n";
print MYSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n";
print MYSQL &getBulkCopyCmd($tableName, "\t");
print $hivefp "drop table if exists $tableName;\ncreate external table $tableName(
name string,
age int,
registration string,
contributions float)
row format delimited
fields terminated by '\\t'
stored as textfile
location '$hdfsTargetDir/$tableName';\n";
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomName();
my $age = randomAge();
my $registration = randomRegistration();
my $contribution = randomContribution();
printf HDFS "%s\t%d\t%s\t%.2f\n", $name, $age,
$registration, $contribution;
}
} elsif ($filetype eq "voternulltab") {
srand(299792458 + $numRows);
print MYSQL "drop table if exists $tableName;\n";
print MYSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n";
print MYSQL "begin transaction;\n";
print $hivefp "drop table if exists $tableName;\ncreate external table $tableName(
name string,
age int,
registration string,
contributions float)
row format delimited
fields terminated by '\\t'
stored as textfile
location '$hdfsTargetDir/$tableName';\n";
for (my $i = 0; $i < $numRows; $i++) {
# generate nulls in a random fashion
my $name = rand(1) < 0.05 ? '' : randomName();
my $age = rand(1) < 0.05 ? '' : randomAge();
my $registration = rand(1) < 0.05 ? '' : randomRegistration();
my $contribution = rand(1) < 0.05 ? '' : randomContribution();
printf MYSQL "insert into $tableName (name, age, registration, contributions) values(";
print MYSQL ($name eq ''? "null, " : "'$name', "),
($age eq ''? "null, " : "$age, "),
($registration eq ''? "null, " : "'$registration', ");
if($contribution eq '') {
print MYSQL "null);\n"
} else {
printf MYSQL "%.2f);\n", $contribution;
}
print HDFS "$name\t$age\t$registration\t";
if($contribution eq '') {
print HDFS "\n"
} else {
printf HDFS "%.2f\n", $contribution;
}
}
print MYSQL "commit;\n";
} elsif ($filetype eq "unicode") {
srand(1.41421 + $numRows);
print MYSQL "drop table if exists $tableName;\n";
print MYSQL "create table $tableName (name varchar(255));\n";
print MYSQL "begin;\n";
print $hivefp "drop table if exists $tableName;\ncreate external table $tableName(
name string)
row format delimited
fields terminated by '\\t'
stored as textfile
location '$hdfsTargetDir/$tableName';\n";
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomUnicodeNonAscii();
printf MYSQL "insert into $tableName (name) values('%s');\n", $name;
printf HDFS "%s\n", $name;
}
print MYSQL "commit;\n";
} elsif ($filetype eq "json") {
srand(6.0221415 + $numRows);
print MYSQL "drop table if exists $tableName;";
print MYSQL "create table $tableName(
s varchar(100),
i int,
d double);";
print MYSQL &getBulkCopyCmd($tableName, "\t", "$tableName.plain");
print $hivefp "drop table if exists $tableName;\ncreate external table $tableName(
s string,
i int,
d double,
m map<string, string>,
bb array<struct<a: int, b: string>>)
row format serde 'org.apache.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
location '$hdfsTargetDir/$tableName';\n";
open(PLAIN, ">$tableName.plain") or
die("Cannot open file $tableName.hive.sql, $!\n");
for (my $i = 0; $i < $numRows; $i++) {
my $s = randomJsonString();
my $i = int(rand(2**32) - 2**31),
my $d = rand(2**10) - 2**9,
# my $i = rand(1) < 0.05 ? 'null' : (int(rand(2**32) - 2**31)),
# my $d = rand(1) < 0.05 ? 'null' : (rand(2**10) - 2**9),
my $m = randomJsonMap();
my $bb = randomJsonBag();
# printf MYSQL "insert into $tableName (name) values('%s');\n", $name;
print HDFS qq@{"s":"$s", "i":$i, "d":$d, "m":$m, "bb":$bb}\n@;
if ($s eq 'null') {
$s="";
}
print PLAIN "$s\t$i\t$d\n";
}
close PLAIN;
print MYSQL "commit;\n";
} else {
warn "Unknown filetype $filetype\n";
usage();
}
}