| #!/usr/bin/env perl |
| ############################################################################ |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| ############################################################################### |
| |
# Test definitions for Pig joins that use the bloom join implementation.
# $cfg names the Pig driver and holds two groups of eight tests each. Every
# test pairs a pig script whose join says using bloom with a verify_pig_script
# that is the same script with a plain join (and without the SET / PARALLEL
# tuning). Presumably the harness compares the outputs of the two scripts;
# the driver is not visible here, so confirm against it.
# Both groups carry execonly => tez (presumably this limits them to the Tez
# execution mode -- TODO confirm).
| $cfg = { |
| 'driver' => 'Pig', |
| |
| 'groups' => [ |
# Group 1: no java_params are given, so pig.bloomjoin.strategy is left at
# whatever its default is.
| { |
| 'name' => 'BloomJoin_Map', |
| 'execonly' => 'tez', |
| 'tests' => [ |
| { |
| # Tuple join key: the join is on (name, age), with pig.bloomjoin.num.filters set to 5 |
| 'num' => 1, |
| 'pig' => q\ |
| SET pig.bloomjoin.num.filters 5; |
| a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); |
| --c = filter a by age < 20; |
| --d = filter b by age < 20; |
| e = join a by (name, age), b by (name, age) using 'bloom'; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); |
| --c = filter a by age < 20; |
| --d = filter b by age < 20; |
| e = join a by (name, age), b by (name, age); |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # bytearray join key: name has no declared type; both inputs are filtered by age < 20 first |
| 'num' => 2, |
| 'pig' => q\ |
| SET mapreduce.input.fileinputformat.split.maxsize '50000'; |
| SET pig.splitCombination false; |
| a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by name, d by name using 'bloom'; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by name, d by name; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Left outer join and chararray join key: name is declared chararray on both inputs |
| 'num' => 3, |
| 'pig' => q\ |
| SET mapreduce.input.fileinputformat.split.maxsize '50000'; |
| SET pig.splitCombination false; |
| a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions); |
| c = join a by name left, b by name using 'bloom'; |
| d = foreach c generate a::name, a::age, gpa, registration, contributions; |
| store d into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions); |
| c = join a by name left, b by name; |
| d = foreach c generate a::name, a::age, gpa, registration, contributions; |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # Right outer join on the (name, age) tuple, with age declared int on both inputs |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions); |
| c = join a by (name,age) right, b by (name,age) using 'bloom'; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions); |
| c = join a by (name,age) right, b by (name,age); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # Left input from a union: c (union of a and b) is the left side; the bloom join adds PARALLEL 3 |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| d = filter d by age > 60; |
| e = join c by name, d by name using 'bloom' PARALLEL 3; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| d = filter d by age > 60; |
| e = join c by name, d by name; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Right input from a union and integer join key: c (filtered union, age:int) is the right side |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); |
| c = union a, b; |
| c = filter c by age > 75; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join d by age, c by age using 'bloom' PARALLEL 3; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); |
| c = union a, b; |
| c = filter c by age > 75; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join d by age, c by age; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Left input from a split: a feeds both filter c (stored to .1) and the left side of join d (stored to .2) |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); |
| b = filter b by age > 75; |
| c = filter a by age > 50; |
| d = join a by age, b by age using 'bloom'; |
| store c into ':OUTPATH:.1'; |
| store d into ':OUTPATH:.2';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); |
| b = filter b by age > 75; |
| c = filter a by age > 50; |
| d = join a by age, b by age; |
| store c into ':OUTPATH:.1'; |
| store d into ':OUTPATH:.2';\, |
| }, |
| { |
| # Right input from a split: a feeds filter d (stored to .1) and, through filter c, the right side of join e (stored to .2) |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); |
| c = filter a by age > 75; |
| d = filter a by name == 'nick miller'; |
| e = join b by age, c by age using 'bloom'; |
| store d into ':OUTPATH:.1'; |
| store e into ':OUTPATH:.2';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); |
| c = filter a by age > 75; |
| d = filter a by name == 'nick miller'; |
| e = join b by age, c by age; |
| store d into ':OUTPATH:.1'; |
| store e into ':OUTPATH:.2';\, |
| }, |
| ] # end of tests (BloomJoin_Map) |
| }, |
# Group 2: the same eight scripts as BloomJoin_Map, run with
# -Dpig.bloomjoin.strategy=reduce passed through java_params.
| { |
| 'name' => 'BloomJoin_Reduce', |
| 'execonly' => 'tez', |
| 'java_params' => ['-Dpig.bloomjoin.strategy=reduce'], |
| 'tests' => [ |
| { |
| # Tuple join key: the join is on (name, age), with pig.bloomjoin.num.filters set to 5 |
| 'num' => 1, |
| 'pig' => q\ |
| SET pig.bloomjoin.num.filters 5; |
| a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); |
| --c = filter a by age < 20; |
| --d = filter b by age < 20; |
| e = join a by (name, age), b by (name, age) using 'bloom'; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); |
| --c = filter a by age < 20; |
| --d = filter b by age < 20; |
| e = join a by (name, age), b by (name, age); |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # bytearray join key: name has no declared type; both inputs are filtered by age < 20 first |
| 'num' => 2, |
| 'pig' => q\ |
| SET mapreduce.input.fileinputformat.split.maxsize '50000'; |
| SET pig.splitCombination false; |
| a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by name, d by name using 'bloom'; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by name, d by name; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Left outer join and chararray join key: name is declared chararray on both inputs |
| 'num' => 3, |
| 'pig' => q\ |
| SET mapreduce.input.fileinputformat.split.maxsize '50000'; |
| SET pig.splitCombination false; |
| a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions); |
| c = join a by name left, b by name using 'bloom'; |
| d = foreach c generate a::name, a::age, gpa, registration, contributions; |
| store d into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age, registration, contributions); |
| c = join a by name left, b by name; |
| d = foreach c generate a::name, a::age, gpa, registration, contributions; |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # Right outer join on the (name, age) tuple, with age declared int on both inputs |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions); |
| c = join a by (name,age) right, b by (name,age) using 'bloom'; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:int, registration, contributions); |
| c = join a by (name,age) right, b by (name,age); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # Left input from a union: c (union of a and b) is the left side; the bloom join adds PARALLEL 3 |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| d = filter d by age > 60; |
| e = join c by name, d by name using 'bloom' PARALLEL 3; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| d = filter d by age > 60; |
| e = join c by name, d by name; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Right input from a union and integer join key: c (filtered union, age:int) is the right side |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); |
| c = union a, b; |
| c = filter c by age > 75; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join d by age, c by age using 'bloom' PARALLEL 3; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age:int, gpa); |
| c = union a, b; |
| c = filter c by age > 75; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join d by age, c by age; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Left input from a split: a feeds both filter c (stored to .1) and the left side of join d (stored to .2) |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); |
| b = filter b by age > 75; |
| c = filter a by age > 50; |
| d = join a by age, b by age using 'bloom'; |
| store c into ':OUTPATH:.1'; |
| store d into ':OUTPATH:.2';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); |
| b = filter b by age > 75; |
| c = filter a by age > 50; |
| d = join a by age, b by age; |
| store c into ':OUTPATH:.1'; |
| store d into ':OUTPATH:.2';\, |
| }, |
| { |
| # Right input from a split: a feeds filter d (stored to .1) and, through filter c, the right side of join e (stored to .2) |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); |
| c = filter a by age > 75; |
| d = filter a by name == 'nick miller'; |
| e = join b by age, c by age using 'bloom'; |
| store d into ':OUTPATH:.1'; |
| store e into ':OUTPATH:.2';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name, age:int, registration, contributions); |
| c = filter a by age > 75; |
| d = filter a by name == 'nick miller'; |
| e = join b by age, c by age; |
| store d into ':OUTPATH:.1'; |
| store e into ':OUTPATH:.2';\, |
| }, |
| ] # end of tests (BloomJoin_Reduce) |
| } |
| ] # end of groups |
| }; |