blob: e3590ddc75b466c37961b77dfc2a10a8655ab2cc [file] [log] [blame]
#!/usr/bin/env perl
############################################################################
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A utility to generate test data for pig test harness tests.
#
#
use strict;
use charnames ();
our @firstName = ("alice", "bob", "calvin", "david", "ethan", "fred",
"gabriella", "holly", "irene", "jessica", "katie", "luke", "mike", "nick",
"oscar", "priscilla", "quinn", "rachel", "sarah", "tom", "ulysses", "victor",
"wendy", "xavier", "yuri", "zach");
our @lastName = ("allen", "brown", "carson", "davidson", "ellison", "falkner",
"garcia", "hernandez", "ichabod", "johnson", "king", "laertes", "miller",
"nixon", "ovid", "polk", "quirinius", "robinson", "steinbeck", "thompson",
"underhill", "van buren", "white", "xylophone", "young", "zipper");
############################################################################
# Explanation of rankedTuples: a pre-ranked set of tuples, each column meaning:
# rownumber: simple RANK, sequential number
# rankcabd: RANK BY c ASC , b DESC
# rankbdaa: RANK BY b DESC, a ASC
# rankbdca: RANK BY b DESC, c ASC
# rankaacd: RANK BY a ASC , c DESC
# rankaaba: RANK BY a ASC , b ASC
# a,b,c: values
# tail: long value in order to create multiple mappers
############################################################################
our @rankedTuples = (
"1,21,5,7,1,1,0,8,8","2,26,2,3,2,5,1,9,10","3,30,24,21,2,3,1,3,10","4,6,10,8,3,4,1,7,2",
"5,8,28,25,3,2,1,0,2","6,28,11,12,4,6,2,7,10","7,9,26,22,5,7,3,2,3","8,5,6,5,6,8,3,8,1",
"9,29,16,15,7,9,4,6,10","10,18,12,10,8,11,5,7,6","11,14,17,14,9,10,5,6,5","12,6,12,8,10,11,5,7,2",
"13,2,17,13,11,10,5,6,0","14,26,3,3,12,14,6,9,10","15,15,20,18,13,13,6,4,5","16,3,29,24,14,12,6,0,0",
"17,23,21,19,15,16,7,4,8","18,19,19,16,16,17,7,5,6","19,20,30,26,16,15,7,0,6","20,12,21,17,17,16,7,4,4",
"21,4,1,1,18,19,7,10,1","22,1,7,4,19,18,7,8,0","23,24,14,11,20,21,8,7,9","24,16,25,20,21,20,8,3,5",
"25,25,27,23,22,22,9,1,9","26,21,8,7,23,25,9,8,8","27,17,4,2,24,26,9,9,6","28,10,8,6,25,25,9,8,4",
"29,11,15,9,25,24,9,7,4","30,12,23,17,25,23,9,4,4");
sub randomName()
{
return sprintf("%s %s", $firstName[int(rand(26))],
$lastName[int(rand(26))]);
}
our @city = ("albuquerque", "bombay", "calcutta", "danville", "eugene",
"frankfurt", "grenoble", "harrisburg", "indianapolis",
"jerusalem", "kellogg", "lisbon", "marseilles",
"nice", "oklohoma city", "paris", "queensville", "roswell",
"san francisco", "twin falls", "umatilla", "vancouver", "wheaton",
"xacky", "youngs town", "zippy");
sub randomCity()
{
return $city[int(rand(26))];
}
our @state = ( "AL", "AK", "AS", "AZ", "AR", "CA", "CO", "CT", "DE", "DC",
"FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
"MA", "MI", "MN", "MS", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
"ND", "OH", "OK", "OR", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA",
"WA", "WV", "WI", "WY");
sub randomState()
{
return $state[int(rand(50))];
}
our @classname = ("american history", "biology", "chemistry", "debate",
"education", "forestry", "geology", "history", "industrial engineering",
"joggying", "kindergarten", "linguistics", "mathematics", "nap time",
"opthamology", "philosophy", "quiet hour", "religion", "study skills",
"topology", "undecided", "values clariffication", "wind surfing",
"xylophone band", "yard duty", "zync studies");
sub randomClass()
{
return $classname[int(rand(26))];
}
our @grade = ("A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-",
"F");
sub randomGrade()
{
return $grade[int(rand(int(@grade)))];
}
our @registration = ("democrat", "green", "independent", "libertarian",
"republican", "socialist");
sub randomRegistration()
{
return $registration[int(rand(int(@registration)))];
}
sub randomAge()
{
return (int(rand(60)) + 18);
}
sub randomGpa()
{
return rand(4.0);
}
our @street = ("A", "B", "C", "D", "E", "F", "G", "H", "I",
"J", "K", "L", "M", "N", "O", "P", "Q", "R", "S",
"T", "U", "V", "W", "X", "Y", "Z");
sub randomStreet()
{
return sprintf("%d %s st", int(rand(1000)), $street[int(rand(26))]);
}
sub randomZip()
{
return int(rand(100000));
}
sub randomContribution()
{
return sprintf("%.2f", rand(1000));
}
our @numLetter = ("1", "09", "09a");
sub randomNumLetter()
{
return $numLetter[int(rand(int(@numLetter)))];
}
our @greekLetter = ( "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
"eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron",
"pi", "rho", "sigma", "tau", "upsilon", "chi", "phi", "psi", "omega" );
sub randomGreekLetter()
{
return $greekLetter[int(rand(int(@greekLetter)))];
}
sub randomNameAgeGpaMap()
{
my $size = int(rand(3));
my $map = "[";
my @mapValues = ( "name#" . randomName(), "age#" . randomAge(), "gpa#" . randomGpa() );
$size = ($size == 0 ? 1 : $size);
for(my $i = 0; $i <= $size; $i++) {
$map .= $mapValues[$i];
if($i != $size) {
$map .= ",";
}
}
$map .= "]";
return $map;
}
sub getMapFields($) {
my $mapString = shift;
# remove the enclosing square brackets
$mapString =~ s/[\[\]]//g;
# get individual map fields
my @fields = split(/,/, $mapString);
# get only the values
my $hash;
for my $field (@fields) {
if($field =~ /(\S+)#(.*)/) {
$hash->{$1} = $2;
}
}
return $hash;
}
sub randomNameAgeGpaTuple()
{
my $gpa = sprintf("%0.2f", randomGpa());
return "(" . randomName() . "," . randomAge() . "," . $gpa . ")" ;
}
sub randomNameAgeGpaBag()
{
my $size = int(rand(int(3)));
my $bag = "{";
$size = ($size == 0 ? 1 : $size);
for(my $i = 0; $i <= $size; $i++) {
$bag .= randomNameAgeGpaTuple();
if($i != $size) {
$bag .= ",";
}
}
$bag .= "}";
return $bag;
}
our @textDoc = (
"The cosmological proof, which we are now about to ex-",
"amine, retains the connection of absolute necessity with the",
"highest reality, but instead of reasoning, like the former proof,",
"from the highest reality to necessity of existence, it reasons",
"from the previously given unconditioned necessity of some",
"being to the unlimited reality of that being. It thus enters upon",
"a course of reasoning which, whether rational or only pseudo-",
"rational, is at any rate natural, and the most convincing not",
"only for common sense but even for speculative understand-",
"ing. It also sketches the first outline of all the proofs in natural",
"theology, an outline which has always been and always will",
"be followed, however much embellished and disguised by",
"superfluous additions. This proof, termed by Leibniz the proof",
"a contingentia mundi, we shall now proceed to expound and",
"examine.");
sub usage()
{
warn "Usage: $0 filetype numrows tablename targetdir [nosql]\n";
warn "\tValid filetypes [studenttab, studentcolon, \n";
warn "\t\tstudentnulltab, studentcomplextab, studentctrla, voternulltab\n";
warn "\t\tvotertab, reg1459894, textdoc, unicode, manual]\n";
}
our @greekUnicode = ("\N{U+03b1}", "\N{U+03b2}", "\N{U+03b3}", "\N{U+03b4}",
"\N{U+03b5}", "\N{U+03b6}", "\N{U+03b7}", "\N{U+03b8}", "\N{U+03b9}",
"\N{U+03ba}", "\N{U+03bb}", "\N{U+03bc}", "\N{U+03bd}", "\N{U+03be}",
"\N{U+03bf}", "\N{U+03c0}", "\N{U+03c1}", "\N{U+03c2}", "\N{U+03c3}",
"\N{U+03c4}", "\N{U+03c5}", "\N{U+03c6}", "\N{U+03c7}", "\N{U+03c8}",
"\N{U+03c9}");
sub randomUnicodeNonAscii()
{
my $name = $firstName[int(rand(int(@firstName)))] .
$greekUnicode[int(rand(int(@greekUnicode)))];
return $name;
}
my $testvar = "\N{U+03b1}\N{U+03b3}\N{U+03b1}\N{U+03c0}\N{U+03b7}";
our @utf8Name = ("佛","实","脑","体","宝","国","双","电","东","马");
sub randomUtf8Name()
{
return sprintf("%s", $utf8Name[int(rand(10))]);
}
sub getBulkCopyCmd(){
my $sourceDir= shift;
my $tableName = shift;
my $delimeter = shift;
$delimeter = '\t' if ( !$delimeter );
# . "\nCOPY $tableName FROM \'$sourceDir/$tableName' using DELIMITERS \'". '\t' . "\' WITH NULL AS '\n';";
my $cmd= "\nbegin transaction;"
. "\nCOPY $tableName FROM \'$sourceDir/$tableName' using DELIMITERS \'$delimeter\';"
. "\ncommit;"
. "\n";
return $cmd;
}
# main($)
{
# explicitly call srand so we get the same data every time
# we generate it. However, we set it individually for each table type.
# Otherwise we'd be generating the same data sets regardless of size,
# and this would really skew our joins.
my $filetype = shift;
my $numRows = shift;
my $tableName = shift;
my $targetDir= shift;
my $nosql = shift;
die usage() if (!defined($filetype) || !defined($numRows));
if ($numRows <= 0) { usage(); }
if ( $targetDir ) {
open(HDFS, "> $targetDir/$tableName") or die("Cannot open file $tableName, $!\n");
open(PSQL, "> $targetDir/$tableName.sql") or die("Cannot open file $tableName.sql, $!\n") unless defined $nosql;
} else {
open(HDFS, "> $tableName") or die("Cannot open file $tableName, $!\n");
open(PSQL, "> $tableName.sql") or die("Cannot open file $tableName.sql, $!\n") unless defined $nosql;
}
if ($filetype eq "manual") {
} elsif ($filetype eq "studenttab") {
srand(3.14159 + $numRows);
print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n" unless defined $nosql;
print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined $nosql;
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomName();
my $age = randomAge();
my $gpa = randomGpa();
printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa;
}
} elsif ($filetype eq "studentnulltab") {
srand(3.14159 + $numRows);
print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n";
print PSQL "begin transaction;\n";
for (my $i = 0; $i < $numRows; $i++) {
# generate nulls in a random fashion
my $name = rand(1) < 0.05 ? '' : randomName();
my $age = rand(1) < 0.05 ? '' : randomAge();
my $gpa = rand(1) < 0.05 ? '' : randomGpa();
printf PSQL "insert into $tableName (name, age, gpa) values(";
print PSQL ($name eq ''? "null, " : "'$name', "), ($age eq ''? "null, " : "$age, ");
if($gpa eq '') {
print PSQL "null);\n"
} else {
printf PSQL "%.2f);\n", $gpa;
}
print HDFS "$name\t$age\t";
if($gpa eq '') {
print HDFS "\n"
} else {
printf HDFS "%.2f\n", $gpa;
}
}
print PSQL "commit;\n" unless defined $nosql;
} elsif ($filetype eq "studentcolon") {
srand(2.718281828459 + $numRows);
print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n" unless defined $nosql;
print PSQL &getBulkCopyCmd( $targetDir, $tableName, ':' ) unless defined $nosql;
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomName();
my $age = randomAge();
my $gpa = randomGpa();
printf HDFS "%s:%d:%.2f\n", $name, $age, $gpa;
=begin
} elsif ($filetype eq "studentusrdef") {
srand(6.62606896 + $numRows);
for (my $i = 0; $i < $numRows; $i++) {
# TODO need to add SQL info.
printf("%s,%d,%.2f,", randomName(), randomAge(), randomGpa());
printf("<%s,%s,%s,%d>,", randomStreet(), randomCity(), randomState(),
randomZip());
printf("[%s:<%s,%s>],", randomClass(), randomClass(), randomName());
printf("{");
my $elementsInBag = int(rand(100));
for (my $j = 0; $j < $elementsInBag; $j++) {
if ($j != 0) { printf(","); }
printf("<%s,%s,%s>", randomClass(), randomName(), randomGrade());
}
printf("}\n");
}
=cut
}
print PSQL "commit;\n" unless defined $nosql;
} elsif ($filetype eq "studentctrla") {
srand(6.14159 + $numRows);
print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n";
print PSQL "begin transaction;\n";
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomName();
my $age = randomAge();
my $gpa = randomGpa();
printf PSQL "insert into $tableName (name, age, gpa) values('%s', %d, %.2f);\n",
$name, $age, $gpa;
printf HDFS "%s%d%.2f\n", $name, $age, $gpa;
}
print PSQL "commit;\n" unless defined $nosql;
} elsif ($filetype eq "studentcomplextab") {
srand(3.14159 + $numRows);
print PSQL "create table $tableName (nameagegpamap varchar(500), nameagegpatuple varchar(500), nameagegpabag varchar(500), nameagegpamap_name varchar(500), nameagegpamap_age integer, nameagegpamap_gpa float(3));\n";
print PSQL "begin transaction;\n";
for (my $i = 0; $i < $numRows; $i++) {
# generate nulls in a random fashion
my $map = rand(1) < 0.05 ? '' : randomNameAgeGpaMap();
my $tuple = rand(1) < 0.05 ? '' : randomNameAgeGpaTuple();
my $bag = rand(1) < 0.05 ? '' : randomNameAgeGpaBag();
printf PSQL "insert into $tableName (nameagegpamap, nameagegpatuple, nameagegpabag, nameagegpamap_name, nameagegpamap_age, nameagegpamap_gpa) values(";
my $mapHash;
if($map ne '') {
$mapHash = getMapFields($map);
}
print PSQL ($map eq ''? "null, " : "'$map', "),
($tuple eq ''? "null, " : "'$tuple', "),
($bag eq '' ? "null, " : "'$bag', "),
($map eq '' ? "null, " : (exists($mapHash->{'name'}) ? "'".$mapHash->{'name'}."', " : "null, ")),
($map eq '' ? "null, " : (exists($mapHash->{'age'}) ? "'".$mapHash->{'age'}."', " : "null, ")),
($map eq '' ? "null);\n" : (exists($mapHash->{'gpa'}) ? "'".$mapHash->{'gpa'}."');\n" : "null);\n"));
print HDFS "$map\t$tuple\t$bag\n";
}
print PSQL "commit;\n" unless defined $nosql;
} elsif ($filetype eq "votertab") {
srand(299792458 + $numRows);
print PSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n" unless defined $nosql;
print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined $nosql;
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomName();
my $age = randomAge();
my $registration = randomRegistration();
my $contribution = randomContribution();
printf HDFS "%s\t%d\t%s\t%.2f\n", $name, $age,
$registration, $contribution;
}
} elsif ($filetype eq "voternulltab") {
srand(299792458 + $numRows);
print PSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n" unless defined $nosql;
print PSQL "begin transaction;\n" unless defined $nosql;
for (my $i = 0; $i < $numRows; $i++) {
# generate nulls in a random fashion
my $name = rand(1) < 0.05 ? '' : randomName();
my $age = rand(1) < 0.05 ? '' : randomAge();
my $registration = rand(1) < 0.05 ? '' : randomRegistration();
my $contribution = rand(1) < 0.05 ? '' : randomContribution();
printf PSQL "insert into $tableName (name, age, registration, contributions) values(";
print PSQL ($name eq ''? "null, " : "'$name', "),
($age eq ''? "null, " : "$age, "),
($registration eq ''? "null, " : "'$registration', ");
if($contribution eq '') {
print PSQL "null);\n"
} else {
printf PSQL "%.2f);\n", $contribution;
}
print HDFS "$name\t$age\t$registration\t";
if($contribution eq '') {
print HDFS "\n"
} else {
printf HDFS "%.2f\n", $contribution;
}
}
print PSQL "commit;\n" unless defined $nosql;
} elsif ($filetype eq "reg1459894") {
srand(6.67428 + $numRows);
print PSQL "create table $tableName (first varchar(10), second varchar(10));\n" unless defined $nosql;
print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined $nosql;
for (my $i = 0; $i < $numRows; $i++) {
my $letter = randomNumLetter();
my $gkLetter = randomGreekLetter();
printf HDFS "%s\t%s\n", $letter, $gkLetter;
}
} elsif ($filetype eq "textdoc") {
# This one ignores the number of lines. It isn't random either.
print PSQL "create table $tableName (name varchar(255));\n" unless defined $nosql;
print PSQL "begin transaction;\n" unless defined $nosql;
for (my $i = 0; $i < @textDoc; $i++) {
my $sqlWords = $textDoc[$i];
$sqlWords =~ s/([\w-]+)/$1,/g;
print PSQL "insert into $tableName (name) values('($sqlWords)');\n" unless defined $nosql;
print HDFS "$textDoc[$i]\n";
}
print PSQL "commit;\n" unless defined $nosql;
} elsif ($filetype eq "unicode") {
srand(1.41421 + $numRows);
print PSQL "create table $tableName (name varchar(255));\n" unless defined $nosql;
print PSQL "begin transaction;\n" unless defined $nosql;
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomUnicodeNonAscii();
printf PSQL "insert into $tableName (name) values('%s');\n",
$name unless defined $nosql;
printf HDFS "%s\n", $name;
}
print PSQL "commit;\n" unless defined $nosql;
} elsif ($filetype eq "allscalar") {
srand(1228.2011 + $numRows);
for (my $i = 0; $i < $numRows; $i++) {
my $name = rand(1) < 0.05 ? '' : randomName();
my $age = rand(1) < 0.05 ? '' : randomAge();
my $gpa = rand(1) < 0.05 ? '' : randomGpa();
my $instate = rand(1) < 0.05 ? '' : (rand(1) < 0.5 ? 'true' : 'false');
printf HDFS "%s\t%d\t%.2f\t%s\n", $name, $age, $gpa, $instate;
}
} elsif ($filetype eq "numbers") {
srand(2012 + $numRows);
for (my $i = 0; $i < $numRows; $i++) {
my $tid = ($i/1000+1) * 1000;
my $rand5 = int(rand(5)) + 1;
my $rand100 = int(rand(100)) + 1;
my $rand1000 = int(rand(1000)) + 1;
my $randf = rand(10);
printf HDFS "%d:%d:%d:%d:%d:%dL:%.2ff:%.2f\n", $tid, $i, $rand5, $rand100, $rand1000, $rand1000, $randf, $randf;
}
} elsif ($filetype eq "ranking") {
for (my $i = 0; $i < $numRows; $i++) {
my $tuple = $rankedTuples[int($i)];
printf HDFS "$tuple,";
for my $j ( 0 .. 1000000) {
printf HDFS "%d",$j;
}
printf HDFS "\n";
}
} elsif ($filetype eq "biggish") {
for (my $i = 1; $i < $numRows; $i++) {
printf HDFS "$i,$i,";
for my $j ( 0 .. 1000) {
printf HDFS "%d",$j;
}
printf HDFS "\n";
}
} elsif ($filetype eq "utf8Student") {
srand(3.14159 + $numRows);
print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n" unless defined $nosql;
print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined $nosql;
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomUtf8Name();
my $age = randomAge();
my $gpa = randomGpa();
printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa;
}
} elsif ($filetype eq "utf8Voter") {
srand(3.14159 + $numRows);
print PSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n" unless defined $nosql;
print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined $nosql;
for (my $i = 0; $i < $numRows; $i++) {
my $name = randomUtf8Name();
my $age = randomAge();
my $registration = randomRegistration();
my $contribution = randomContribution();
printf HDFS "%s\t%d\t%s\t%.2f\n", $name, $age,
$registration, $contribution;
}
} else {
warn "Unknown filetype $filetype\n";
usage();
}
}