# Test configuration defining two groups of Pig scripts ('Orc' and 'Orc_Pushdown')
# that exercise OrcStorage. Each test's 'pig' script stores data with OrcStorage,
# reloads it after an exec, and writes the result to :OUTPATH:. The matching
# 'verify_pig_script' computes the same result without going through OrcStorage.
# NOTE(review): keys such as 'notmq', 'execonly', 'floatpostprocess' and 'delimiter'
# are interpreted by the test harness, which is not visible here -- confirm their
# meaning against the harness before changing them.
| $cfg = { |
| 'driver' => 'Pig', |
| 'nummachines' => 5, |
| 'verify_with_pig' => 1, |
| 'verify_pig_version' => 'old', |
| |
| 'groups' => [ |
| { |
| 'name' => 'Orc', |
| 'tests' => [ |
| # Test 1: Load (primitive) from PigStorage and store into OrcStorage, then reload and filter by age < 30 |
| # Also tests multiple load/store statements in the same script (separated by exec) |
| { |
| 'num' => 1, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| store a into ':OUTPATH:.intermediate' using OrcStorage(); |
| exec |
| b = load ':OUTPATH:.intermediate' using OrcStorage(); |
| c = filter b by age < 30; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = filter a by age < 30; |
| store b into ':OUTPATH:';\, |
| }, |
| # Test 2: Load (complex: map, tuple, bag) from PigStorage and store into OrcStorage, then reload and store unchanged |
| { |
| 'num' => 2, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)}); |
| store a into ':OUTPATH:.intermediate' using OrcStorage(); |
| exec |
| b = load ':OUTPATH:.intermediate' using OrcStorage(); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)}); |
| store a into ':OUTPATH:';\, |
| }, |
| # Test 3: Aggregation test (join, group, COUNT, order) using two OrcStorage datasets |
| # Also incorporates handling bytearrays (presumably the values looked up from the untyped map -- confirm) |
| { |
| 'num' => 3, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float); |
| store a into ':OUTPATH:.simple.intermediate' using OrcStorage(); |
| exec |
| b = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)}, nameagegpamap_name:chararray, nameagegpamap_age:int, nameagegpamap_gpa:float); |
| store b into ':OUTPATH:.complex.intermediate' using OrcStorage(); |
| exec |
| c = load ':OUTPATH:.simple.intermediate' using OrcStorage(); |
| d = load ':OUTPATH:.complex.intermediate' using OrcStorage(); |
| e = foreach c generate name, age, gpa; |
| f = foreach d generate nameagegpamap#'name' as name, nameagegpamap#'age' as age, nameagegpamap#'gpa' as gpa, nameagegpatuple.tage as tage, FLATTEN(nameagegpabag) as (bname, bage, bgpa); |
| g = join e by name, f by name; |
| h = group g by (f::bgpa); |
| j = foreach h generate group, COUNT(g) as students; |
| k = order j by group, students; |
| store k into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float); |
| b = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)}, nameagegpamap_name:chararray, nameagegpamap_age:int, nameagegpamap_gpa:float); |
| c = foreach a generate name, age, gpa; |
| d = foreach b generate nameagegpamap#'name' as name, nameagegpamap#'age' as age, nameagegpamap#'gpa' as gpa, nameagegpatuple.tage as tage, FLATTEN(nameagegpabag) as (bname, bage, bgpa); |
| e = join c by name, d by name; |
| f = group e by (d::bgpa); |
| g = foreach f generate group, COUNT(e) as students; |
| h = order g by group, students; |
| store h into ':OUTPATH:';\, |
| }, |
| # Test 4: Test various options passed to OrcStorage on store (-c, -s, -r, -b, -p, -v) |
| { |
| 'num' => 4, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| store a into ':OUTPATH:.orc_params.intermediate' using OrcStorage('-c ZLIB -s 67108864 -r 100000 -b 1048576 -p true -v 0.12'); |
| exec |
| b = load ':OUTPATH:.orc_params.intermediate' using OrcStorage(); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| store a into ':OUTPATH:';\, |
| }, |
| # Test 5: Test loading a null map key; the verify script emits an empty map when name is null |
| { |
| 'num' => 5, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age); |
| b = foreach a generate TOMAP(name, age) as m; |
| store b into ':OUTPATH:.intermediate' using OrcStorage(); |
| exec |
| c = load ':OUTPATH:.intermediate' using OrcStorage(); |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age); |
| b = foreach a generate (name is null ? [] : TOMAP(name, age)); |
| store b into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Orc_Pushdown', |
| 'tests' => [ |
| # Orc_Pushdown: each test sorts its input, stores it with OrcStorage, reloads it and applies a filter (group name suggests filter pushdown is the target -- confirm) |
| # Test 1: Filter on a chararray column (name < 'david falkner') |
| { |
| 'num' => 1, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = order a by name parallel 4; |
| store b into ':OUTPATH:.intermediate' using OrcStorage(); |
| exec |
| b = load ':OUTPATH:.intermediate' using OrcStorage(); |
| c = filter b by name < 'david falkner'; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = filter a by name < 'david falkner'; |
| store b into ':OUTPATH:';\, |
| }, |
# Test 2: Filter on an int column (age <= 22) over studenttab20m, sorted by age
# descending and stored with OrcStorage('-s 10000000')
| { |
| 'num' => 2, |
| 'notmq' => 1, |
| 'execonly' => 'mapred,tez', # studenttab20m not available in local mode |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab20m' as (name:chararray, age:int, gpa:float); |
| b = order a by age desc parallel 4; |
| store b into ':OUTPATH:.intermediate' using OrcStorage('-s 10000000'); |
| exec |
| b = load ':OUTPATH:.intermediate' using OrcStorage(); |
| c = filter b by age <= 22; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab20m' as (name:chararray, age:int, gpa:float); |
| b = filter a by age <= 22; |
| store b into ':OUTPATH:';\, |
| }, |
# Test 3: Compound filter on float and int columns
# (gpa >= 3.2 and gpa < 3.5 and age > 30 + 2), data sorted by gpa
| { |
| 'num' => 3, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = order a by gpa parallel 4; |
| store b into ':OUTPATH:.intermediate' using OrcStorage(); |
| exec |
| b = load ':OUTPATH:.intermediate' using OrcStorage(); |
| c = filter b by gpa >= 3.2 and gpa < 3.5 and age > 30 + 2; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = filter a by gpa >= 3.2 and gpa < 3.5 and age > 30 + 2; |
| store b into ':OUTPATH:';\, |
| }, |
# Test 4: Filter on a bigdecimal column (gpa >= 3.5), data sorted by gpa
| { |
| 'num' => 4, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:bigdecimal); |
| b = order a by gpa parallel 4; |
| store b into ':OUTPATH:.intermediate' using OrcStorage(); |
| exec |
| b = load ':OUTPATH:.intermediate' using OrcStorage(); |
| c = filter b by gpa >= 3.5; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:bigdecimal); |
| b = filter a by gpa >= 3.5; |
| store b into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
# Test 5: Filter on a date column built with ToDate. Rows with age > 35 get
# 2010-01-01 and the rest get 2010-01-05, so the filter d >= 2010-01-03 keeps
# exactly the rows with age <= 35, which is what the verify script selects.
# NOTE(review): the test script declares gpa as double while the verify script
# declares it as bigdecimal -- confirm this difference is intentional.
| { |
| 'num' => 5, |
| 'notmq' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate name, age, gpa, (age>35 ? ToDate('20100101', 'yyyyMMdd', 'UTC') : ToDate('20100105', 'yyyyMMdd', 'UTC')) as d; |
| c = order b by d parallel 4; |
| store c into ':OUTPATH:.intermediate' using OrcStorage(); |
| exec |
| b = load ':OUTPATH:.intermediate' using OrcStorage(); |
| c = filter b by d >= ToDate('20100103', 'yyyyMMdd', 'UTC'); |
| d = foreach c generate name, age, gpa; |
| store d into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:bigdecimal); |
| b = filter a by age<=35; |
| store b into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| ] |
| }, |
| |
| ] |
| }; |
| |