# blob: 97f6c05e18948c3446fd369e601f26f0d63998ad [file] [log] [blame]
#!/usr/bin/env perl
############################################################################
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# Bloom-join test configuration.
#
# Assigns a single hash reference to $cfg with:
#   driver - the string 'Pig'
#   groups - two test groups, both marked execonly 'tez':
#              BloomJoin_Map    - no extra java_params
#              BloomJoin_Reduce - java_params -Dpig.bloomjoin.strategy=reduce
#
# Every test carries a 'num', a 'pig' script whose join says "using 'bloom'",
# and a 'verify_pig_script' that performs the same join without that clause.
# Presumably the driver compares the output of the two scripts -- confirm
# against the test driver.
#
# NOTE(review): $cfg is assigned without any declaration; presumably the code
# that loads this file supplies and reads that variable -- confirm before
# adding 'my' or strictures here.
#
# The q{...} bodies are literal script text. Their line breaks and leading
# columns are part of the value, so nothing inside them may be re-indented,
# and a Perl comment placed inside them would become script text. Lines that
# start with "--" live inside the literals (presumably Pig-level comments that
# disable a filter).
$cfg = {
    driver => 'Pig',
    groups => [
        {
            name     => 'BloomJoin_Map',
            execonly => 'tez',
            tests    => [
                {
                    # Composite (name, age) join key; no field types declared.
                    num => 1,
                    pig => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions);
--c = filter a by age < 20;
--d = filter b by age < 20;
e = join a by (name, age), b by (name, age) using 'bloom';
store e into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions);
--c = filter a by age < 20;
--d = filter b by age < 20;
e = join a by (name, age), b by (name, age);
store e into ':OUTPATH:';},
                },
                {
                    # Join key 'name' has no declared type (bytearray key).
                    # Only the bloom script sets the split max size and turns
                    # off pig.splitCombination.
                    num => 2,
                    pig => q{
SET mapreduce.input.fileinputformat.split.maxsize '50000';
SET pig.splitCombination false;
a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions);
c = filter a by age < 20;
d = filter b by age < 20;
e = join c by name, d by name using 'bloom';
store e into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions);
c = filter a by age < 20;
d = filter b by age < 20;
e = join c by name, d by name;
store e into ':OUTPATH:';},
                },
                {
                    # Left outer join; key 'name' is declared chararray.
                    # Only the bloom script carries the two SET lines.
                    num => 3,
                    pig => q{
SET mapreduce.input.fileinputformat.split.maxsize '50000';
SET pig.splitCombination false;
a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions);
c = join a by name left, b by name using 'bloom';
d = foreach c generate a::name, a::age, gpa, registration, contributions;
store d into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions);
c = join a by name left, b by name;
d = foreach c generate a::name, a::age, gpa, registration, contributions;
store d into ':OUTPATH:';},
                },
                {
                    # Right outer join on the (name, age) pair.
                    num => 4,
                    pig => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions);
c = join a by (name,age) right, b by (name,age) using 'bloom';
store c into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions);
c = join a by (name,age) right, b by (name,age);
store c into ':OUTPATH:';},
                },
                {
                    # The left join input is a union of two loads. The bloom
                    # script also asks for PARALLEL 3; the verify script does not.
                    num => 5,
                    pig => q{a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa);
c = union a, b;
d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
d = filter d by age > 60;
e = join c by name, d by name using 'bloom' PARALLEL 3;
store e into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa);
c = union a, b;
d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
d = filter d by age > 60;
e = join c by name, d by name;
store e into ':OUTPATH:';},
                },
                {
                    # The right join input is a filtered union; the key 'age'
                    # is declared int on the union side. PARALLEL 3 appears in
                    # the bloom script only.
                    num => 6,
                    pig => q{a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa);
c = union a, b;
c = filter c by age > 75;
d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
e = join d by age, c by age using 'bloom' PARALLEL 3;
store e into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa);
c = union a, b;
c = filter c by age > 75;
d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
e = join d by age, c by age;
store e into ':OUTPATH:';},
                },
                {
                    # Relation 'a' is the left join input and also feeds a
                    # separate filter, so it is consumed twice. Two outputs are
                    # stored, under the '.1' and '.2' suffixes.
                    num => 7,
                    pig => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions);
b = filter b by age > 75;
c = filter a by age > 50;
d = join a by age, b by age using 'bloom';
store c into ':OUTPATH:.1';
store d into ':OUTPATH:.2';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions);
b = filter b by age > 75;
c = filter a by age > 50;
d = join a by age, b by age;
store c into ':OUTPATH:.1';
store d into ':OUTPATH:.2';},
                },
                {
                    # The right join input 'c' is filtered from 'a', which also
                    # feeds the independent filter 'd'. Two outputs are stored,
                    # under the '.1' and '.2' suffixes.
                    num => 8,
                    pig => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions);
c = filter a by age > 75;
d = filter a by name == 'nick miller';
e = join b by age, c by age using 'bloom';
store d into ':OUTPATH:.1';
store e into ':OUTPATH:.2';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions);
c = filter a by age > 75;
d = filter a by name == 'nick miller';
e = join b by age, c by age;
store d into ':OUTPATH:.1';
store e into ':OUTPATH:.2';},
                },
            ],    # tests of BloomJoin_Map
        },
        {
            # The eight scripts below are the same as the BloomJoin_Map ones;
            # this group differs only by the extra java_params entry.
            name        => 'BloomJoin_Reduce',
            execonly    => 'tez',
            java_params => ['-Dpig.bloomjoin.strategy=reduce'],
            tests       => [
                {
                    # Composite (name, age) join key; no field types declared.
                    num => 1,
                    pig => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions);
--c = filter a by age < 20;
--d = filter b by age < 20;
e = join a by (name, age), b by (name, age) using 'bloom';
store e into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions);
--c = filter a by age < 20;
--d = filter b by age < 20;
e = join a by (name, age), b by (name, age);
store e into ':OUTPATH:';},
                },
                {
                    # Join key 'name' has no declared type (bytearray key).
                    # Only the bloom script sets the split max size and turns
                    # off pig.splitCombination.
                    num => 2,
                    pig => q{
SET mapreduce.input.fileinputformat.split.maxsize '50000';
SET pig.splitCombination false;
a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions);
c = filter a by age < 20;
d = filter b by age < 20;
e = join c by name, d by name using 'bloom';
store e into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions);
c = filter a by age < 20;
d = filter b by age < 20;
e = join c by name, d by name;
store e into ':OUTPATH:';},
                },
                {
                    # Left outer join; key 'name' is declared chararray.
                    # Only the bloom script carries the two SET lines.
                    num => 3,
                    pig => q{
SET mapreduce.input.fileinputformat.split.maxsize '50000';
SET pig.splitCombination false;
a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions);
c = join a by name left, b by name using 'bloom';
d = foreach c generate a::name, a::age, gpa, registration, contributions;
store d into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions);
c = join a by name left, b by name;
d = foreach c generate a::name, a::age, gpa, registration, contributions;
store d into ':OUTPATH:';},
                },
                {
                    # Right outer join on the (name, age) pair.
                    num => 4,
                    pig => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions);
c = join a by (name,age) right, b by (name,age) using 'bloom';
store c into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions);
c = join a by (name,age) right, b by (name,age);
store c into ':OUTPATH:';},
                },
                {
                    # The left join input is a union of two loads. The bloom
                    # script also asks for PARALLEL 3; the verify script does not.
                    num => 5,
                    pig => q{a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa);
c = union a, b;
d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
d = filter d by age > 60;
e = join c by name, d by name using 'bloom' PARALLEL 3;
store e into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa);
c = union a, b;
d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
d = filter d by age > 60;
e = join c by name, d by name;
store e into ':OUTPATH:';},
                },
                {
                    # The right join input is a filtered union; the key 'age'
                    # is declared int on the union side. PARALLEL 3 appears in
                    # the bloom script only.
                    num => 6,
                    pig => q{a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa);
c = union a, b;
c = filter c by age > 75;
d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
e = join d by age, c by age using 'bloom' PARALLEL 3;
store e into ':OUTPATH:';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa);
b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa);
c = union a, b;
c = filter c by age > 75;
d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions);
e = join d by age, c by age;
store e into ':OUTPATH:';},
                },
                {
                    # Relation 'a' is the left join input and also feeds a
                    # separate filter, so it is consumed twice. Two outputs are
                    # stored, under the '.1' and '.2' suffixes.
                    num => 7,
                    pig => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions);
b = filter b by age > 75;
c = filter a by age > 50;
d = join a by age, b by age using 'bloom';
store c into ':OUTPATH:.1';
store d into ':OUTPATH:.2';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions);
b = filter b by age > 75;
c = filter a by age > 50;
d = join a by age, b by age;
store c into ':OUTPATH:.1';
store d into ':OUTPATH:.2';},
                },
                {
                    # The right join input 'c' is filtered from 'a', which also
                    # feeds the independent filter 'd'. Two outputs are stored,
                    # under the '.1' and '.2' suffixes.
                    num => 8,
                    pig => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions);
c = filter a by age > 75;
d = filter a by name == 'nick miller';
e = join b by age, c by age using 'bloom';
store d into ':OUTPATH:.1';
store e into ':OUTPATH:.2';},
                    verify_pig_script => q{a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa);
b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions);
c = filter a by age > 75;
d = filter a by name == 'nick miller';
e = join b by age, c by age;
store d into ':OUTPATH:.1';
store e into ':OUTPATH:.2';},
                },
            ],    # tests of BloomJoin_Reduce
        },
    ],    # groups
};