| #!/usr/bin/env perl |
| |
| ############################################################################ |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| ############################################################################### |
| # Nightly tests for Pig. |
| # |
| # The $cfg structure below lists the test groups and the Pig Latin script for each test. |
| |
| #PigSetup::setup(); |
| |
| #my $me = `whoami`; |
| #chomp $me; |
| |
| $cfg = { |
| 'driver' => 'Pig', |
| 'nummachines' => 5, |
| 'verify_with_pig' => 1, |
| 'verify_pig_version' => 'old', |
| |
| 'groups' => [ |
| { |
| 'name' => 'Checkin', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| store a into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 50; |
| d = filter b by age < 50; |
| e = cogroup c by (name, age), d by (name, age) ; |
| f = foreach e generate flatten(c), flatten(d); |
| g = group f by registration; |
| h = foreach g generate group, SUM(f.d::contributions); |
| i = order h by $1; |
| store i into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'sortArgs' => ['-t', ' ', '-k', '2,2n'], |
| } |
| ] |
| }, |
| { |
| 'name' => 'LoaderDefaultDir', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/dir/studenttab10k' as (name, age, gpa); |
| store a into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'LoaderPigStorageArg', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| store a into ':OUTPATH:';\, |
| }, |
| { |
| # load with control character |
| 'num' => 2, |
| 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa); |
| store a into ':OUTPATH:';#, |
| }, |
| { |
| # load and store with control character |
| 'num' => 3, |
| 'pig' => q#a = load ':INPATH:/singlefile/studentctrla10k' using PigStorage('\\u0001') as (name, age, gpa); |
| store a into ':OUTPATH:.intermediate' using PigStorage('\\u0001'); |
| b = load ':OUTPATH:.intermediate' using PigStorage('\\u0001') as (name, age, gpa); |
| store b into ':OUTPATH:'; #, |
| 'notmq' => 1, |
| }, |
| ] |
| }, |
| { |
| # Results doctored: if you change this query, you need to copy the |
| # expected results into test/nightly/benchmarks. |
| 'name' => 'LoaderBinStorage', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Swap(name, age), TOKENIZE((chararray)name), org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age); |
| store b into ':OUTPATH:.intermediate' using BinStorage(); |
| c = load ':OUTPATH:.intermediate' using BinStorage(); |
| store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, |
| 'notmq' => 1, |
| }, |
| ] |
| }, |
| { |
| # Results doctored: if you change this query, you need to copy the |
| # expected results into test/nightly/benchmarks. |
| 'name' => 'LoaderTextLoader', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/textdoc' using TextLoader(); |
| b = foreach a generate TOKENIZE((chararray)$0); |
| store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'FilterBoolean', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name == 'fred allen' and age > 50; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/dir/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name != 'fred allen' or age < 10; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by not (age == 50); |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by (age >= 50 or name > 'fred') and (gpa <= 3.0 or name >= 'bob'); |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by age >= 50 or name > 'fred' and gpa <= 3.0 or name >= 'bob'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test filter <= and >= for chararray, int and double |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test filter <= and >= for bytearray, long and float |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float); |
| b = filter a by age >= 40 and age <=50 and gpa >= 2.0f and gpa <= 3.0f and name >= 'bob' and name <= 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test filter < and > for chararray, int and double |
| { |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = filter a by age > 40 and age <50 and gpa > 2.0 and gpa < 3.0 and name > 'bob' and name < 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test filter < and > for bytearray, long and float |
| { |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float); |
| b = filter a by age > 40 and age <50 and gpa > 2.0f and gpa < 3.0f and name > 'bob' and name < 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test filter <= and >= for explicit cast for chararray, int and double |
| { |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by (int)age >= 40 and (int)age <=50 and (double)gpa >= 2.0 and (double)gpa <= 3.0 and (chararray)name >= 'bob' and (chararray)name <= 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test filter <= and >= for explicit cast for bytearray, long and float |
| { |
| 'num' => 11, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by (long)age >= 40 and (long)age <=50 and (float)gpa >= 2.0f and (float)gpa <= 3.0f and name >= 'bob' and name <= 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test filter < and > for explicit cast for chararray, int and double |
| { |
| 'num' => 12, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test filter < and > for explicit cast for bytearray, long and float |
| { |
| 'num' => 13, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by (long)age > 40 and (long)age <50 and (float)gpa > 2.0f and (float)gpa < 3.0f and name > 'bob' and name < 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test AND with nulls |
| { |
| 'num' => 14, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name == 'fred allen' and age > 50; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test OR with nulls |
| { |
| 'num' => 15, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name != 'fred allen' or age < 10; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test with nulls filter <= and >= for chararray, int and double |
| { |
| 'num' => 16, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = filter a by age >= 40 and age <=50 and gpa >= 2.0 and gpa <= 3.0 and name >= 'bob' and name <= 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test with nulls filter < and > for explicit cast for chararray, int and double |
| { |
| 'num' => 17, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by (int)age > 40 and (int)age <50 and (double)gpa > 2.0 and (double)gpa < 3.0 and (chararray)name > 'bob' and (chararray)name < 'fred'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 18, |
| 'ignore' => 1, # PIG-2593 this case is not supported as instate need to be declared as boolean |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by instate; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by instate == 'true'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 19, |
| 'ignore' => 1, # PIG-2593 this case is not supported as instate need to be declared as boolean |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by not instate; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by instate == 'false'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 20, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by instate is null; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 21, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by instate == true; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by instate == 'true'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 22, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by instate == false; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name, age, gpa, instate); |
| b = filter a by instate == 'false'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 23, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = filter a by instate; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = filter a by instate == 'true'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 24, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = filter a by not instate; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = filter a by instate == 'false'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 25, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = filter a by instate is null; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = filter a by instate is null; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 26, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = filter a by instate == true; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = filter a by instate == 'true'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 27, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = filter a by instate == false; |
| store b into ':OUTPATH:' using PigStorage;\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = filter a by instate == 'false'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| ], |
| }, |
| { |
| 'name' => 'FilterEq', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name == 'alice johnson' and age == 64 and gpa == 3.99; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name > 'fred allen' and age > 40 and gpa > 2.50; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name >= 'fred allen' and age >= 40 and gpa >= 2.50; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name lt 'fred allen' and age < 40 and gpa < 2.50; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name lte 'fred allen' and age <= 40 and gpa <= 2.50; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage(); |
| b = filter a by $0 neq 'fred allen' and $1 != '40' and $2 != '2.50'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test for filter == for chararray, int and double |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test for filter == for bytearray, long and float |
| { |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float); |
| b = filter a by name == 'fred allen' and age == 61 and gpa == 1.42f; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test for filter != for chararray, int and double |
| { |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test for filter != for bytearray, long and float |
| { |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:float); |
| b = filter a by $0 != 'fred allen' and $1 != 40 and $2 != 2.50f; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test for filter == for explicit casts to chararray, int and double |
| { |
| 'num' => 11, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by (chararray)name == 'fred allen' and (int)age == 61 and (double)gpa == 1.42; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test for filter == for explicit casts to bytearray, long and float |
| { |
| 'num' => 12, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name == 'fred allen' and (long)age == 61 and (float)gpa == 1.42f; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test for filter != for explicit casts to chararray, int and double |
| { |
| 'num' => 13, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ; |
| b = filter a by (chararray)$0 != 'fred allen' and (int)$1 != 40 and (double)$2 != 2.50; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| # test for filter != for explicit casts to bytearray, long and float |
| { |
| 'num' => 14, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() ; |
| b = filter a by $0 != 'fred allen' and (long)$1 != 40 and (float)$2 != 2.50f; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'FilterMatches', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name matches '^fred.*'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage(); |
| b = filter a by not $0 matches '^fred.*'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| # test for filter on matches for chararray (declared and explicit cast) |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double); |
| b = filter a by name matches '^fred.*' and (chararray)registration matches '^dem.*'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double); |
| b = filter a by name matches 'f.ed' and (chararray)registration matches 'd.m'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double); |
| b = filter a by name matches 'f[^f]ed.*'; |
| store b into ':OUTPATH:' using PigStorage;\, |
| }, |
| { |
| 'num' => 6, |
| 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '.*\\\\wan.*';\nstore b into ':OUTPATH:' using PigStorage;", |
| }, |
| { |
| 'num' => 7, |
| 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches '^e.*\\\\sc.*';\nstore b into ':OUTPATH:' using PigStorage;", |
| }, |
| { |
| 'num' => 8, |
| 'pig' => "a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name:chararray, age:int, registration, contributions:double);\nb = filter a by name matches 'ethan white';\nstore b into ':OUTPATH:' using PigStorage;", |
| }, |
| { |
| 'num' => 9, |
| 'pig' => "a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age, gpa);\nb = filter a by gpa matches '\\\\d\\\\.45';\nstore b into ':OUTPATH:' using PigStorage;", |
| }, |
| ] |
| }, |
| { |
| 'name' => 'FilterUdf', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = cogroup a by (name, age), b by (name, age); |
| d = filter c by not IsEmpty(a); |
| e = filter d by not IsEmpty(b); |
| f = foreach e generate flatten(a), flatten(b); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 50; |
| d = filter b by age < 50; |
| e = cogroup c by (name, age), d by (name, age); |
| f = filter e by COUNT(c)> 0 AND COUNT(d)>0; |
| store f into ':OUTPATH:';\, |
| 'rc' => 0 |
| }, |
| ] |
| }, |
| # TODO: Groups that don't flatten via Agg functions |
| { |
| 'name' => 'GroupAggFunc', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, COUNT(a.age); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = group a by $0; |
| c = foreach b generate group, COUNT(a.$1); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by (name, age); |
| c = foreach b generate group.name, group.age, COUNT(a.gpa); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a all; |
| c = foreach b generate COUNT(a.$0); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, SUM(a.age); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, SUM(a.gpa); |
| store c into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| { |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, AVG(a.age); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 9, |
| 'floatpostprocess' => 1, |
| 'ignore23' => 'I cannot get it right due to float precision, temporarily disable', |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, AVG(a.gpa); |
| store c into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| { |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, MIN(a.gpa); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 11, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, MAX(a.gpa); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 12, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by (name, age); |
| c = foreach b generate flatten(group), SUM(a.gpa); |
| store c into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| { |
| 'num' => 13, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by (name); |
| c = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| d = cogroup b by group, c by name; |
| e = foreach d generate flatten(group), SUM(c.gpa), COUNT(c.name); |
| store e into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| { |
| 'num' => 14, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = group a by (name); |
| e = foreach b generate COUNT(a.name); |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = group a by (name); |
| e = foreach b generate COUNT(a.name); |
| store e into ':OUTPATH:';\, |
| } |
| ], |
| }, |
| { |
| 'name' => 'MapPartialAgg', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, COUNT(a.age); |
| store c into ':OUTPATH:';\, |
| 'java_params' => ['-Dpig.exec.mapPartAgg=true'] |
| }, |
| { |
| #multiquery with group in one sub query |
| 'num' => 2, |
| 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| b = filter a by age < 22; store b into ':OUTPATH:.1'; |
| c = group b by age; |
| d = foreach c generate group, SUM(b.gpa); |
| store d into ':OUTPATH:.2'; #, |
| 'java_params' => ['-Dpig.exec.mapPartAgg=true'] |
| |
| }, |
| { |
| #multi query with two groups on different columns |
| 'num' => 3, |
| 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| g1 = group a by name; |
| f1 = foreach g1 generate group as name, MAX(a.gpa); |
| store f1 into ':OUTPATH:.1'; |
| g2 = group a by age; |
| f2 = foreach g2 generate group as age, AVG(a.gpa); |
| store f2 into ':OUTPATH:.2'; #, |
| 'java_params' => ['-Dpig.exec.mapPartAgg=true'] |
| |
| }, |
| { |
| #multi query with four groups on different columns, one group key being an expression |
| 'num' => 4, |
| 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| g1 = group a by name; |
| f1 = foreach g1 generate group as name, MAX(a.gpa); |
| store f1 into ':OUTPATH:.1'; |
| g2 = group a by age%10; |
| f2 = foreach g2 generate group as age_mod10, AVG(a.gpa); |
| store f2 into ':OUTPATH:.2'; |
| g3 = group a by age; |
| f3 = foreach g3 generate group%10, AVG(a.gpa); |
| store f3 into ':OUTPATH:.3'; |
| g4 = group a by gpa; |
| f4 = foreach g4 generate group as gpa, COUNT(a); |
| store f4 into ':OUTPATH:.4'; |
| |
| #, |
| 'java_params' => ['-Dpig.exec.mapPartAgg=true'] |
| |
| }, |
| { |
| #aggregation gets more than one tuple for every tuple from load func |
| |
| 'num' => 5, |
| 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| b = foreach a generate name, age, gpa, flatten(TOBAG(age,age)) as x; |
| c = group b by age; |
| d = foreach c generate group, AVG(b.gpa); |
| store d into ':OUTPATH:'; #, |
| 'java_params' => ['-Dpig.exec.mapPartAgg=true'] |
| |
| }, |
| |
| { |
| #PIG-4707 Streaming and empty input |
| |
| 'num' => 6, |
| 'pig' => q# a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| b = group a by name; |
| c = foreach b generate flatten(a); |
| d = stream c through `cat` as (name, age, gpa); |
| e = filter d by name == 'nonexistent'; |
| SPLIT e into f if gpa > 2, g otherwise; |
| store f into ':OUTPATH:.1'; |
| store g into ':OUTPATH:.2'; |
| #, |
| 'java_params' => ['-Dpig.exec.mapPartAgg=true'] |
| |
| }, |
| |
| ], |
| }, |
| { |
| 'name' => 'EvalFunc', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = filter a by name lt 'b'; |
| c = foreach b generate ARITY(name, age, gpa); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa); |
| b = filter a by name lt 'b'; |
| c = foreach b generate TOKENIZE(name); |
| d = foreach c generate flatten($0); |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = filter a by name lt 'b'; |
| c = foreach b generate org.apache.pig.test.udf.evalfunc.Swap(name, age); |
| store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = filter a by name lt 'b'; |
| c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, age); |
| store c into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = foreach a generate org.apache.pig.test.udf.evalfunc.TestBoolean(instate); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = foreach a generate (instate is null ? '' : (instate == 'true' ? 'false' : 'true')); |
| store b into ':OUTPATH:';\, |
| } |
| |
| ] |
| }, |
| # TODO DIFF |
| # TODO User defined grouping function |
| { |
| 'name' => 'CoGroupFlatten', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = cogroup c by name, d by name; |
| f = foreach e generate flatten (c), flatten(d); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by $1 < 20; |
| d = filter b by $1 < 20; |
| e = cogroup c by $0, d by $0; |
| f = foreach e generate flatten (c), flatten(d); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = cogroup c by (name, age), d by (name, age); |
| f = foreach e generate flatten (c), flatten(d); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| d = filter b by age < 20; |
| e = cogroup a by (name, age) inner, d by (name, age); |
| f = foreach e generate flatten (a), flatten(d); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| e = cogroup c by (name, age), b by (name, age) inner; |
| f = foreach e generate flatten (c), flatten(b); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = cogroup a by (name, age) inner, b by (name, age) inner; |
| f = foreach e generate flatten (a), flatten(b); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| # Test cogrouping data loaded from two separate loaders. We don't have any data that can join with studenttab that isn't also loaded with PigStorage, so the |
| # first step is an intermediate load and store using BinStorage. |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| store a into ':OUTPATH:.intermediate' using BinStorage(); |
| b = load ':OUTPATH:.intermediate' using BinStorage() as (name, age, gpa); |
| c = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = cogroup b by (name, age) inner, c by (name, age) inner; |
| f = foreach e generate flatten (b), flatten(c); |
| store f into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| |
| ] |
| }, |
| { |
| 'name' => 'CoGroup', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = cogroup a by name, b by name; |
| d = foreach c generate flatten(group), COUNT(a) + COUNT(b); |
| store d into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'Join', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by name, d by name; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by $0, d by $0; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by (name, age), d by (name, age); |
| store e into ':OUTPATH:';\, |
| }, |
| # self join with implicit split |
| # JIRA PIG-429 |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = filter a by $1 > 25; |
| c = join a by $0, b by $0; |
| store c into ':OUTPATH:';\, |
| }, |
| # join with one input having schema and another without |
| # JIRA PIG-428 |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double); |
| another = load ':INPATH:/singlefile/studenttab10k'; |
| c = foreach another generate $0, $1+ 10, $2 + 10.0; |
| d = join a by $0, c by $0; |
| store d into ':OUTPATH:';\, |
| }, |
| # self join using fragment replicate join |
| # no types |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = join a by name, b by name using 'repl'; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = join a by name, b by name ; |
| store c into ':OUTPATH:';\, |
| }, |
| # self join using fragment replicate join |
| # with types and no cast for join key |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| c = join a by name, b by name using 'repl'; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| c = join a by name, b by name ; |
| store c into ':OUTPATH:';\, |
| |
| }, |
| # self join using fragment replicate join |
| # with types and cast for join key |
| { |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| c = join a by gpa, b by gpa using 'repl'; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa); |
| c = join a by gpa, b by gpa ; |
| store c into ':OUTPATH:';\, |
| |
| }, |
| # left outer join |
| { |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double); |
| c = join a by name left outer, b by name; |
| store c into ':OUTPATH:';\, |
| }, |
| # right outer join |
| { |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double); |
| c = join a by name right outer, b by name; |
| store c into ':OUTPATH:';\, |
| }, |
| # full outer join |
| { |
| 'num' => 11, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/voternulltab10k' as (name:chararray, age:long, registration:chararray, contributions:double); |
| c = join a by name full outer, b by name; |
| store c into ':OUTPATH:';\, |
| }, |
| # See PIG-1209: the join package now uses internalcachedBag, so every tuple on the reduce side in this test will be spilled to disk. |
| { |
| 'num' => 12, |
| 'java_params' => ['-Dpig.cachedbag.memusage=0'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by name, d by name; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 13, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by instate, d by instate parallel 5; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by instate, d by instate parallel 5; |
| store e into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Foreach', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate *; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = foreach a generate *; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name, age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = foreach a generate $0, $2; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test filter, projection, sort , duplicate elimination |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = filter a by age < 20; |
| c = group b by age; |
| d = foreach c { |
| cf = filter b by gpa < 3.0; |
| cp = cf.gpa; |
| cd = distinct cp; |
| co = order cd by $0; |
| generate group, flatten(co); |
| } |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # test flatten for map and scalar |
| 'num' => 6, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate flatten(name) as n, flatten(org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, gpa)) as m; |
| store b into ':OUTPATH:' using org.apache.pig.test.udf.storefunc.StringStore();\, |
| }, |
| { |
| # test flatten for UDF that returns bag with multiple tuples with multiple columns |
| 'num' => 7, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name, flatten(org.apache.pig.test.udf.evalfunc.CreateTupleBag(age, gpa)) as foo; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age: int, gpa); |
| c = group a by name; |
| d = foreach c generate flatten(group), MAX(a.age) + MIN(a.age); |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # test filter, projection, sort , duplicate elimination |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = filter a by age < 20; |
| c = group b by age; |
| d = foreach c { |
| cf = filter b by gpa >= 3.0 and gpa <= 3.5; |
| cp = cf.gpa; |
| cd = distinct cp; |
| co = order cd by $0; |
| generate group, flatten(co); |
| } |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # test filter, projection, sort , duplicate elimination |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = filter a by age < 20; |
| c = group b by age; |
| d = foreach c { |
| cf = filter b by (gpa == 4.0 or gpa != 2.0) and name > 'a'; |
| cp = cf.gpa; |
| cd = distinct cp; |
| co = order cd by $0; |
| generate group, flatten(co); |
| } |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # test expression aliases defined inside a nested foreach block (exp2 reuses exp1) |
| 'num' => 11, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = filter a by age < 20; |
| c = foreach b { |
| exp1 = age + gpa; |
| exp2 = exp1 + age; |
| generate exp1, exp2; |
| } |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # test a udf with no args |
| 'num' => 12, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name, org.apache.pig.test.udf.evalfunc.Fred() as fred; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 13, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = foreach a generate *; |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = foreach a generate *; |
| store b into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Order', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name; |
| c = order b by name; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = foreach a generate $1; |
| c = order b by $0; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate gpa; |
| c = order b by gpa; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = order a by *; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' '], |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name, age; |
| c = order b by name, age; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,2'], |
| }, |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| c = order a by $0; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| c = order a by $1; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '2,2'], |
| }, |
| { |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| c = order a by $0, $1; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,2'], |
| }, |
| { |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| c = order a by $1, $0; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '2,2', '-k', '1,1'], |
| }, |
| { |
| 'num' => 10, |
| 'ignore' => 'order by UDF is not supported', |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| c = order a by * using org.apache.pig.test.udf.orderby.OrdDesc; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-r'], |
| }, |
| { |
| 'num' => 11, |
| 'ignore' => 'order by UDF is not supported', |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| c = order a by $0 using org.apache.pig.test.udf.orderby.OrdDesc; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-r', '-k', '1,1'], |
| }, |
| { |
| 'num' => 12, |
| 'ignore' => 'order by UDF is not supported', |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| c = order a by $0, $1 using org.apache.pig.test.udf.orderby.OrdDesc; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-r', '-k', '1,2'], |
| }, |
| # ALERT All these tests with inner order bys aren't testing the inner |
| # ordering. We need to develop a sorting tool to do that. |
| { |
| 'num' => 13, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = group a by $0; |
| c = foreach b {c1 = order $1 by $1; generate flatten(c1); }; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 14, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = group a by $0; |
| c = foreach b {c1 = order $1 by *; generate flatten(c1); }; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 15, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| b = group a by $0; |
| c = foreach b {c1 = order $1 by * using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1); }; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 16, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| b = group a by $0; |
| c = foreach b {c1 = order $1 by $1 using org.apache.pig.test.udf.orderby.OrdDesc; generate flatten(c1);}; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 17, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = group a by $0; |
| c = foreach b {c1 = order $1 by $1; generate flatten(c1), MAX($1.$1); }; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # test to make sure the weighted range partitioning |
| # works correctly when a sort key value repeats across |
| # reduce partitions |
| 'num' => 18, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = order a by $1 parallel 100; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '2,2'], |
| }, |
| { |
| 'num' => 19, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = foreach a generate instate; |
| c = order b by instate; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = foreach a generate instate; |
| c = order b by instate; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| 'num' => 20, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by name ASC, age DESC parallel 9; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2,2nr'], |
| }, |
| ] |
| }, |
| { |
| 'name' => 'Distinct', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name; |
| c = distinct b; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = foreach a generate $1; |
| c = distinct b; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate gpa; |
| c = distinct b; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = distinct a; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name, age; |
| c = distinct b; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b { aa = distinct a.age; generate group, COUNT(aa); } |
| store c into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Cross', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 19 and gpa < 1.0; |
| d = filter b by age < 19; |
| e = cross c, d; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 19 and gpa < 1.0; |
| d = filter b by age < 19; |
| e = cross c, d parallel 10; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\set default_parallel 10; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 19 and gpa < 1.0; |
| d = filter b by age < 19; |
| e = cross c, d; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 25; |
| d = filter b by age < 25; |
| e = cross c, d; |
| f = filter e by c::age < d::age; |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\ |
| set default_parallel 2 |
| a = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| b = foreach a generate registration; |
| c = distinct b; |
| d = group c all; |
| e = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| f = cross e, d; |
| g = foreach f generate $0, $1, $2, flatten($3); |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| b = foreach a generate registration; |
| c = distinct b; |
| d = group c all; |
| e = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| f = cross e, d; |
| g = foreach f generate $0, $1, $2, flatten($3); |
| store g into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Union', |
| 'tests' => [ |
| { |
| # Simple store |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = foreach a generate name, age; |
| d = foreach b generate name, age; |
| e = union c, d; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Groupby + Combiner |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = group c by name; |
| e = foreach d generate group, SUM(c.age); |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Groupby + Secondary key partitioner |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = group c by name; |
| d1 = group c by name; -- Two separate groupbys to ensure secondary key partitioner |
| e = foreach d { f = order c by age, gpa ; g = limit f 1; generate g; }; |
| h = foreach d1 { i = order c by age asc, gpa desc; j = limit i 1; generate j; }; |
| store e into ':OUTPATH:.1'; |
| store h into ':OUTPATH:.2';\, |
| }, |
| { |
| # Union + Orderby |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = order c by name PARALLEL 2; |
| store d into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| # Simple split + Union |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| split a into a1 if age < 50, a2 otherwise; |
| c = union a1, b; |
| d = order c by name PARALLEL 2; |
| store a2 into ':OUTPATH:.1'; |
| store d into ':OUTPATH:.2';\, |
| }, |
| { |
| # Union + Join |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join c by name, d by name PARALLEL 2; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Replicate Join left |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join c by name, d by name using 'replicated'; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Replicate Join right |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join d by name, c by name using 'replicated'; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Skewed Join left |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join c by name, d by name using 'skewed' PARALLEL 5; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Skewed Join right |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join d by name, c by name using 'skewed' PARALLEL 5; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 11, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| d = foreach a generate name, age; |
| e = foreach b generate name, age; |
| f = foreach c generate name, age; |
| g = union d, e; |
| h = union f, g; |
| i = group h by name; |
| i = foreach i generate group, SUM(h.age); |
| store i into ':OUTPATH:';\, |
| }, |
| { |
| # Union + operators |
| 'num' => 12, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age:int, gpa:double); |
| c = union a, b; |
| -- Exercise all expression operators -- |
| d = foreach c generate (name is not NULL? UPPER(name) : 'FNU LNU') as name, (age < 30 ? -1 : age) as age, (gpa is NULL ? 0.0 : ((gpa > 0.5 AND gpa < 1.0) ? 1 : gpa)) as gpa; |
| e = filter d by (name matches '.*MIKE.*') OR (NOT (gpa + 1.5 > 4)); |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Groupby + Replicate join |
| 'num' => 13, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = group c by name; |
| e = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| f = join d by group, e by name using 'replicated'; |
| g = foreach f generate group, flatten(c), name, age, registration, contributions; |
| store g into ':OUTPATH:';\, |
| }, |
| { |
| # Group by with Secondary Key + Union |
| 'num' => 14, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name:chararray, age, gpa); |
| c = group a by name; |
| d = foreach c { |
| sorted = order a by name,age,gpa; |
| lmt = limit sorted 1; |
| generate lmt as c1; |
| }; |
| e = foreach d generate flatten(c1) as (name:chararray, age, gpa); |
| f = group b by name; |
| g = foreach f { |
| sorted = order b by name,age,gpa; |
| lmt = limit sorted 1; |
| generate lmt as f1; |
| }; |
| h = foreach g generate flatten(f1) as (name:chararray, age, gpa); |
| i = union e, h; |
| j = order i by name parallel 1; |
| store j into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| # Union + Cross |
| 'num' => 15, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa:float); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa:float); |
| c = filter a by gpa >= 4; |
| d = cross a, c; |
| e = union b, d; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Distinct -- store the distinct result (d) so that DISTINCT is actually exercised |
| 'num' => 16, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa); |
| c = union a, b; |
| d = distinct c; |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # Union + Groupby + FILTER |
| 'num' => 17, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa:float); |
| b = load ':INPATH:/singlefile/studentcolon10k' using PigStorage(':') as (name, age, gpa:float); |
| c = group a by name; |
| d = group b by name; |
| e = union c, d; |
| e = foreach e generate $0, $1 as groupbag; |
| f = foreach e { |
| g = order $1 by age asc, gpa desc; |
| h = filter g by (gpa == 0 ? true : false); |
| generate group, h; }; |
| store f into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Bincond', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate name, (name matches 'yuri.*' ? age - 10 : (int)age); |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'Glob', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10?' as (name, age, gpa); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/st*ttab10k' as (name, age, gpa); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/voter*' as (name, age, registration, contributions); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/student???10k' as (name, age, registration, contributions); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentta[a-z][1-9]0[!m],:INPATH:/singlefile/voter{,null}tab10k' as (name, age); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab[13]0[km]' as (name, age, gpa); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab[12]0[a-l]' as (name, age, gpa); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/glob/star/*good' as (name, age, gpa); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/glob/star/*' as (name, age, gpa); |
| b = filter a by name == 'nick miller'; |
| store b into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Arithmetic', |
| # +, -, * and / on the untyped age and gpa columns of studenttab10k. |
| # Odd-numbered tests 1-7 use integer constants and cast gpa to int; |
| # even-numbered tests 2-8 use floating point constants and cast age to double. |
| # Tests 9 and 10 combine the operators without and with parentheses. |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate age + 1, (int)gpa + 1; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate (double)age + 1.5, gpa + 1.5; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate age - 30, (int)gpa - 3; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate (double)age - 30.1, gpa - 3.199; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate age * 10, (int)gpa * 2; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate (double)age * 10.1, gpa * 2.752342; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate age / 30, (int)gpa / 3; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate (double)age / 30.323, gpa / 3.22; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # mixed operators with no parentheses |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate 3 * age + gpa / 9.1 - 2; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # same operands as test 9, regrouped with parentheses |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| c = foreach a generate 3 * (age + gpa) / (9.1 - 2); |
| store c into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Regression', |
| # NOTE(review): the test numbers here are presumably issue/bug identifiers |
| # rather than a sequence - confirm. |
| 'tests' => [ |
| { |
| # group by the first column and COUNT the second; input file is reg1459894 |
| 'num' => 1459894, |
| 'pig' => q\a = load ':INPATH:/singlefile/reg1459894'; |
| b = group a by $0; |
| c = foreach b generate group, COUNT(a.$1); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # COUNT of both bags produced by a cogroup on name |
| 'num' => 97, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = cogroup a by name, b by name; |
| f = foreach e generate group, COUNT(a), COUNT(b); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| # the script is padded with Pig "--" comment lines so that its size exceeds 1k |
| # (see the first comment line inside the script) |
| 'num' => 203, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, COUNT($1); |
| store c into ':OUTPATH:'; |
| --This is a really long script to test that when script size exceeds 1k we can still parse it. |
| --The quick sly fox jumped over the lazy brown dog. |
| --he quick sly fox jumped over the lazy brown dog.T |
| --e quick sly fox jumped over the lazy brown dog.Th |
| -- quick sly fox jumped over the lazy brown dog.The |
| --quick sly fox jumped over the lazy brown dog.The |
| --uick sly fox jumped over the lazy brown dog.The q |
| --ick sly fox jumped over the lazy brown dog.The qu |
| --ck sly fox jumped over the lazy brown dog.The qui |
| --k sly fox jumped over the lazy brown dog.The quic |
| -- sly fox jumped over the lazy brown dog.The quick |
| --sly fox jumped over the lazy brown dog.The quick |
| --ly fox jumped over the lazy brown dog.The quick s |
| --y fox jumped over the lazy brown dog.The quick sl |
| -- fox jumped over the lazy brown dog.The quick sly |
| --fox jumped over the lazy brown dog.The quick sly |
| --ox jumped over the lazy brown dog.The quick sly f |
| --x jumped over the lazy brown dog.The quick sly fo |
| -- jumped over the lazy brown dog.The quick sly fox |
| --jumped over the lazy brown dog.The quick sly fox |
| --umped over the lazy brown dog.The quick sly fox j |
| --mped over the lazy brown dog.The quick sly fox ju |
| --ped over the lazy brown dog.The quick sly fox jum\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Unicode', |
| 'tests' => [ |
| { |
| # load the file unicode100 without a schema and store it unchanged |
| # (presumably the file holds non-ASCII data - confirm) |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/unicode100'; |
| store a into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'Parameters', |
| 'tests' => [ |
| { |
| # test %default: $fname gets its value from the script itself |
| 'num' => 1, |
| 'pig' => q\%default fname 'studenttab10k' |
| a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); |
| b = foreach a generate name; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test parameter from command line (-p) |
| 'num' => 2, |
| 'pig_params' => ['-p', qq(fname='studenttab10k')], |
| 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); |
| b = foreach a generate name; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test parameter from param file (-m) |
| 'num' => 3, |
| 'pig_params' => ['-m', ":PARAMPATH:/params_3"], |
| 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); |
| b = foreach a generate name; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test %declare with a backtick command whose output becomes the value |
| 'num' => 4, |
| 'pig' => q\%declare cmd `perl -e "print 'studenttab10k'"` |
| a = load ':INPATH:/singlefile/$cmd' using PigStorage() as (name, age, gpa); |
| b = foreach a generate name; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test parameter with a space; the verify script is the same minus the $setting line |
| 'num' => 5, |
| 'pig_params' => ['-p', qq(setting='set default_parallel 100;'),'-p',qq(fname='studenttab10k')], |
| 'pig' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); |
| $setting |
| b = foreach a generate name; |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/$fname' using PigStorage() as (name, age, gpa); |
| b = foreach a generate name; |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| |
| { |
| 'name' => 'Types_Constants', |
| 'tests' => [ |
| { |
| # constants: int (1), float (0.2f) and long (253645L) mixed in one expression |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate age + 1 + 0.2f + 253645L, gpa+1; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # constants: same expressions as test 1 under a unary minus |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate -(age + 1 + 0.2f + 253645L), -(gpa+1); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test that precision for doubles is at least 15 digits |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate 0.123456789123456+0.123456789123456; |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| |
| { |
| 'name' => 'Types_Cast', |
| 'tests' => [ |
| { |
| # NULL and cast: null norm_gpa values are replaced by 0 |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int; |
| c = foreach b generate (norm_gpa is null? 0 :norm_gpa); |
| store c into ':OUTPATH:';\, |
| # 'expected_err_regex' => "Encountered Warning DIVIDE_BY_ZERO 2387 time.*", |
| # Driver does not currently support both 'sql' and 'expected_...' verification directives. |
| }, |
| { |
| # Not NULL and cast: same expression as test 1 with the null test inverted |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate (int)((int)gpa/((int)gpa - 1)) as norm_gpa:int; |
| c = foreach b generate (norm_gpa is not null? norm_gpa: 0); |
| store c into ':OUTPATH:';\, |
| }, |
| # boolean cast: the verify script declares instate as chararray and uses 'true'/'false' strings |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = foreach a generate instate, true, false; |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = foreach a generate instate, 'true', 'false'; |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| |
| { |
| 'name' => 'Types_ArithmeticCast', |
| 'tests' => [ |
| { |
| # arithmetic operators and SIZE for int and double; SIZE and CONCAT for chararrays |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate age, gpa, age % 25, age + 25, age - 25, age/2, age * 2, SIZE(age), gpa + 10.1, gpa - 1.1 , gpa / 1.2, gpa * 2.5, SIZE(gpa), SIZE(name), CONCAT(name, 'test'); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # arithmetic operators and SIZE for long and float; SIZE and CONCAT for bytearrays |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); |
| b = foreach a generate age, gpa, age % 2L, age + 2500000000L, age - 2500000000L, age/2L, age * 250000000L, SIZE(age), gpa + 10.1f, gpa - 1.1f , gpa / 1.2f, gpa * 2.6f, SIZE(gpa), SIZE(name), CONCAT(name, name); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # equality and implicit cast: untyped age is compared with the string '25', untyped gpa with 3 |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = filter a by age == '25' and gpa < 3; |
| store b into ':OUTPATH:';\, |
| |
| }, |
| { |
| # sum, min, max, avg for int and double, min and max for chararray (declared) |
| # will need to test against a previous version of pig, because in pig |
| # count currently includes nulls - this affects avg |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = group a ALL; |
| c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), MIN(a.name), MAX(a.name), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa); |
| store c into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| { |
| # sum, min, max, avg for long and float (declared) |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); |
| b = group a ALL; |
| c = foreach b generate SUM(a.age), MIN(a.age), MAX(a.age), AVG(a.age), SUM(a.gpa), MIN(a.gpa), MAX(a.gpa), AVG(a.gpa); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # Explicit casts on untyped columns - arithmetic operators and SIZE for int and double; SIZE and CONCAT for chararrays |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = foreach a generate (int)age % 25, (int)age + 25, (int)age - 25, (int)age/2, (int)age * 2, SIZE((int)age), (double)gpa + 10.1, (double)gpa - 1.1 , (double)gpa / 1.2, (double)gpa * 2.5, SIZE((double)gpa), SIZE((chararray)name), CONCAT((chararray)name, 'test'); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # Explicit casts on untyped columns - arithmetic operators and SIZE for long and float |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| b = foreach a generate (long)age, (long)age % 2L, (long)age + 2500000000L, (long)age - 2500000000L, (long)age/2L, (long)age * 250000000L, SIZE((long)age), (float)gpa + 10.1f, (float)gpa - 1.1f , (float)gpa / 1.2f, (float)gpa * 2.6f, SIZE((float)gpa); |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| |
| { |
| 'name' => 'Types_Filter', |
| # each test filters on a mix of "is null" / "is not null" and stores the COUNT of surviving rows |
| 'tests' => [ |
| { |
| # Filter: is null for chararray and double, is not null for int |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = filter a by name is null and age is not null and gpa is null; |
| c = group b ALL; |
| d = foreach c generate COUNT(b); |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # Filter: is not null for chararray and double, is null for int |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = filter a by name is not null and age is null and gpa is not null; |
| c = group b ALL; |
| d = foreach c generate COUNT(b); |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # Filter: is null for bytearray and float, is not null for long |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); |
| b = filter a by name is null and age is not null and gpa is null; |
| c = group b ALL; |
| d = foreach c generate COUNT(b); |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # Filter: is not null for bytearray and float, is null for long |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); |
| b = filter a by name is not null and age is null and gpa is not null; |
| c = group b ALL; |
| d = foreach c generate COUNT(b); |
| store d into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| |
| { |
| 'name' => 'Types_Order', |
| # NOTE(review): 'sortArgs' look like sort(1) options (-t delimiter, -k keys with |
| # n = numeric, r = reverse) used by the driver to check the output order - confirm. |
| 'tests' => [ |
| { |
| # test that sorting is based on the type for chararray, int and double |
| 'num' => 1, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by name, age, gpa; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,3n'], |
| }, |
| { |
| # test that sorting descending is based on the type for chararray, int and double |
| 'num' => 2, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by name desc, age desc, gpa desc; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,3nr'], |
| }, |
| { |
| # test that sorting is based on the type for bytearray, long and float |
| 'num' => 3, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); |
| b = order a by name, age, gpa; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,3n'], |
| }, |
| { |
| # test that sorting descending is based on the type for bytearray, long and float |
| 'num' => 4, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age:long, gpa:float); |
| b = order a by name desc, age desc, gpa desc; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,3nr'], |
| }, |
| { |
| # order by string |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by name; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| # order by string desc |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by name desc; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1r,1r'], |
| }, |
| { |
| # order by int |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by age; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '2n,2n'], |
| }, |
| { |
| # order by int desc |
| 'num' => 8, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by age desc; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '2nr,2nr'], |
| }, |
| { |
| # order by long |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double); |
| b = order a by age; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '2n,2n'], |
| }, |
| { |
| # order by long desc |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:long, gpa:double); |
| b = order a by age desc; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '2nr,2nr'], |
| }, |
| { |
| # order by float |
| 'num' => 11, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float); |
| b = order a by gpa; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '3n'], |
| }, |
| { |
| # order by float desc |
| 'num' => 12, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float); |
| b = order a by gpa desc; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '3nr'], |
| }, |
| { |
| # order by double |
| 'num' => 13, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by gpa; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '3n'], |
| }, |
| { |
| # order by double desc |
| 'num' => 14, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by gpa desc; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '3nr'], |
| }, |
| { |
| # order by * (all columns, checked with the same sortArgs as test 1) |
| 'num' => 15, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by *; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1', '-k', '2n,3n'], |
| }, |
| { |
| # order by * desc (all columns, checked with the same sortArgs as test 2) |
| 'num' => 16, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| b = order a by * desc; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1r,1r', '-k', '2nr,3nr'], |
| }, |
| ] |
| }, |
| |
| { |
| 'name' => 'Types_CoGroup', |
| # cogroup of two typed, filtered relations, then flatten of both bags |
| 'tests' => [ |
| { |
| # cogroup key is a chararray (name) |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = cogroup c by name, d by name; |
| f = foreach e generate flatten (c), flatten(d); |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| # cogroup key is an int (age) |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:double); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = cogroup c by age, d by age; |
| f = foreach e generate flatten (c), flatten(d); |
| store f into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| |
| { |
| 'name' => 'Limit', |
| # limit after order/filter/distinct/union, limit nested in foreach, and limit |
| # with a constant or scalar expression as the row count |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = order a by $0, $1; |
| c = filter b by $0 > 'a'; -- break the sort/limit optimization |
| d = limit c 100; |
| store d into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,2'], |
| }, |
| { |
| 'num' => 2, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = order a by $0, $1, $2; |
| c = limit b 100; |
| store c into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,3'], |
| }, |
| { |
| # Make sure that a limit higher than the number of rows doesn't mess stuff up |
| 'num' => 3, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = order a by $0, $1; |
| c = filter b by $1 < 1000; |
| d = limit c 100000; |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # limit after distinct |
| 'num' => 4, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = distinct a; |
| c = limit b 100; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # limit after union |
| 'num' => 5, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| a1 = foreach a generate $0, $1; |
| b1 = foreach b generate $0, $1; |
| c = union a1, b1; |
| d = limit c 100; |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # limit directly after load, followed by filter and group |
| 'num' => 6, |
| 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| B = limit A 40; |
| C = filter B by age == 40; |
| D = group C by name; |
| E = foreach D generate group, COUNT(C); |
| store E into ':OUTPATH:';\, |
| }, |
| { |
| # limit nested inside a foreach block |
| 'num' => 7, |
| 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| B = group A by name; |
| C = foreach B { |
| C1 = limit A 10; |
| generate group, COUNT(C1); |
| } |
| store C into ':OUTPATH:';\, |
| }, |
| { |
| # nested filter followed by nested limit inside a foreach block |
| 'num' => 8, |
| 'pig' =>q\A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| B = group A by name; |
| C = foreach B { |
| C1 = filter A by age < 40; |
| C2 = limit C1 10; |
| generate group, COUNT(C2); |
| } |
| D = filter C by $1 > 0; |
| store D into ':OUTPATH:';\, |
| }, |
| { |
| # limit with a constant expression (1000/10); verified against a plain "limit b 100" |
| 'num' => 9, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = order a by $0, $1, $2; |
| c = limit b 1000/10; |
| store c into ':OUTPATH:';\, |
| |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = order a by $0, $1, $2; |
| c = limit b 100; |
| store c into ':OUTPATH:';\, |
| |
| # NOTE(review): the script orders by three columns but the keys here are 1,2; |
| # test 2 uses '1,3' for the same ordering - confirm whether this is intended. |
| 'sortArgs' => ['-t', ' ', '-k', '1,2'], |
| }, |
| { |
| # limit with a scalar expression (c.count/10); verified against "limit a 1000" |
| 'num' => 10, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = group a all; |
| c = foreach b generate COUNT(a) as count; |
| d = limit a c.count/10; |
| store d into ':OUTPATH:';\, |
| |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = limit a 1000; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # limit with an expression over two scalars (c.count/10+f.count/10); verified against "limit a 2000" |
| # NOTE(review): alias d is assigned twice in the script (load, then limit) - confirm this is intended. |
| 'num' => 11, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = group a all; |
| c = foreach b generate COUNT(a) as count; |
| d = load ':INPATH:/singlefile/votertab10k'; |
| e = group d all; |
| f = foreach e generate COUNT(d) as count; |
| d = limit a c.count/10+f.count/10; |
| store d into ':OUTPATH:';\, |
| |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = limit a 2000; |
| store b into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Split', |
| # explicit "split ... into ... if ..." followed by store, cogroup, distinct or order |
| 'tests' => [ |
| { |
| # store the first branch of the split |
| 'num' => 1, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| split a into a1 if $0 > 'm', a2 if $0 <= 'm'; |
| store a1 into ':OUTPATH:';\, |
| }, |
| { |
| # store the second branch of the split |
| 'num' => 2, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| split a into a1 if $0 > 'm', a2 if $0 <= 'm'; |
| store a2 into ':OUTPATH:';\, |
| }, |
| { |
| # cogroup both branches, flatten by alias |
| 'num' => 3, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; |
| split a into a1 if $0 > 'm', a2 if $0 <= 'm'; |
| b = cogroup a1 by $1, a2 by $1; |
| c = foreach b generate flatten(a1), flatten(a2); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # same as test 3, but flatten by position ($1, $2) |
| 'num' => 4, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; |
| split a into a1 if $0 > 'm', a2 if $0 <= 'm'; |
| b = cogroup a1 by $1, a2 by $1; |
| c = foreach b generate flatten($1), flatten($2); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # split on a named untyped column, then distinct |
| 'num' => 5, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| split a into a1 if name > 'm', a2 if name <= 'm'; |
| b = distinct a1; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # split conditions that do not cover every row (age > 50 / age <= 25), then order |
| 'num' => 6, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name, age, gpa); |
| split a into a1 if age > 50, a2 if age <= 25; |
| b = order a2 by name; |
| store b into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' ', '-k', '1,1'], |
| }, |
| { |
| # split conditions on two different typed columns |
| 'num' => 7, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| split a into a1 if name > 'm', a2 if age < 50; |
| b = distinct a1; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # foreach on each branch, then cogroup and COUNT of both bags |
| 'num' => 8, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| split a into a1 if age > 50, a2 if name < 'm'; |
| b2 = foreach a2 generate name, 1; |
| b1 = foreach a1 generate name, 2; |
| c = cogroup b2 by name, b1 by name; |
| d = foreach c generate flatten(group), COUNT($1), COUNT($2); |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # distinct on one branch and order on the other, then cogroup and COUNT |
| 'num' => 9, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| split a into a1 if age > 50, a2 if name < 'm'; |
| b2 = distinct a2; |
| b1 = order a1 by name; |
| c = cogroup b2 by name, b1 by name; |
| d = foreach c generate flatten(group), COUNT($1), COUNT($2); |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # "otherwise" branch with two stores; the verify script spells the second condition out as age<=50 |
| 'num' => 10, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| split a into a1 if age > 50, a2 otherwise; |
| store a1 into ':OUTPATH:.1'; |
| store a2 into ':OUTPATH:.2';\, |
| 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| split a into a1 if age > 50, a2 if age<=50; |
| store a1 into ':OUTPATH:.1'; |
| store a2 into ':OUTPATH:.2';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'ImplicitSplit', |
| # relation a feeds two separate filters (no explicit split statement) whose outputs are cogrouped |
| 'tests' => [ |
| { |
| # untyped, positional columns |
| 'num' => 1, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = filter a by $1 > 50; |
| c = filter a by $2 > 3.0; |
| d = cogroup b by $0, c by $0; |
| e = foreach d generate flatten(b), flatten(c); |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # typed, named columns, plus a filter on the disambiguated alias b::age |
| 'num' => 2, |
| 'pig' =>q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = filter a by age > 50; |
| c = filter a by gpa > 3.0; |
| d = cogroup b by name, c by name; |
| e = foreach d generate flatten(b), flatten(c); |
| f = filter e by b::age < 75; |
| store f into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'describe', |
| 'tests' => [ |
| # JIRA[PIG-373] |
| { |
| # describe in the middle of a script that also stores its input |
| 'num' => 1, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| describe A; |
| store A into ':OUTPATH:';\, |
| |
| }, |
| { |
| # describe only, no store: the describe output is matched against expected_out_regex |
| 'num' => 2, |
| 'pig' => q\ |
| A = load 'sample' as (line:chararray); |
| B = foreach A generate flatten(STRSPLIT(line)) as (i0, i1, i2); |
| describe B;\, |
| 'expected_out_regex' => 'B: {i0: bytearray,i1: bytearray,i2: bytearray}', |
| }, |
| ], |
| }, |
| { |
| 'name' => 'Sample', |
| # sample with an expression instead of a literal sampling rate |
| 'tests' => [ |
| { |
| # constant expression 2-1-1; verified against "sample A 0" |
| 'num' => 1, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| S = sample A 2-1-1; |
| store S into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| S = sample A 0; |
| store S into ':OUTPATH:';\, |
| |
| |
| }, |
| { |
| # scalar expression E.count/C.count (both are COUNT(A)); verified against "sample A 1" |
| 'num' => 2, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| B = group A all; |
| C = foreach B generate COUNT(A) as count; |
| D = group A all; |
| E = foreach D generate (double)COUNT(A) as count; |
| S = sample A E.count/C.count; |
| store S into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| S = sample A 1; |
| store S into ':OUTPATH:';\, |
| }, |
| ], |
| }, |
| { |
| 'name' => 'MissingColumns', |
| # these scripts reference columns beyond (name, age, gpa), the schema other tests |
| # use for studenttab10k: a declared 'extra', position $3, and 'extra1'/'extra2' |
| 'tests' => [ |
| { |
| # filter and order on the declared fourth column 'extra' |
| 'num' => 1, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age: int, gpa: double, extra: chararray); |
| B = filter A by age > 50 or extra > 'm'; |
| D = order B by age, extra; |
| store D into ':OUTPATH:';\, |
| |
| 'sortArgs' => ['-t', ' ', '-k', '2n,2n', '-k', '4,4'], |
| }, |
| { |
| # arithmetic and grouping on positional column $3, with no schema declared |
| 'num' => 2, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage(); |
| B = foreach A generate $0, $1 + 1, $3 + 1; |
| C = group B by ($0, $2); |
| D = foreach C generate flatten(group), COUNT($1); |
| store D into ':OUTPATH:';\, |
| |
| }, |
| { |
| # join whose key on the B side includes the declared column 'extra1' |
| 'num' => 3, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: double); |
| B = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa, extra1, extra2); |
| C = join A by (name, age), B by (name, extra1); |
| store C into ':OUTPATH:';\, |
| |
| # This pig query is expected to produce empty results. (The original note referred to a verification SQL query that is not present here.) |
| } |
| ], |
| }, |
| { |
| 'name' => 'Aliases', |
| # check that a field can be accessed through each of its valid aliases |
| 'tests' => [ |
| { |
| # check that a free standing alias reference works |
| # when it is unambiguous (name) |
| # check that a fully qualified alias reference works (d::a::name) |
| # check that a partially qualified unambiguous alias reference works (a::name) |
| 'num' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| b = group a by name; |
| c = foreach b generate flatten(a); |
| d = filter c by name != 'fred'; |
| e = group d by name; |
| f = foreach e generate flatten(d); |
| g = foreach f generate name, d::a::name as dname, a::name as aname; |
| store g into ':OUTPATH:';\, |
| |
| }, |
| { |
| # check that the "group" alias is still available |
| # in the next statement after a flatten(group) |
| 'num' => 2, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| b = group a by name; |
| c = foreach b generate flatten(group), COUNT(a) as cnt; |
| d = foreach c generate group; |
| store d into ':OUTPATH:';\, |
| |
| }, |
| ], |
| }, |
| |
| { |
| 'name' => 'Lineage', |
| # test that the right cast function is picked |
| 'tests' => [ |
| { |
| # untyped columns from PigStorage and TextLoader combined by cogroup ... ALL, then cast to chararray |
| 'num' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence); |
| c = cogroup a ALL, b ALL; |
| d = foreach c generate flatten(a), flatten(b); |
| e = foreach d generate name, flatten(TOKENIZE((chararray)sentence)) as sentence; |
| f = foreach e generate CONCAT((chararray)name, sentence); |
| store f into ':OUTPATH:';\, |
| |
| }, |
| { |
| # same casts as test 1, with the two loads combined by cross |
| 'num' => 2, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa: double); |
| b = load ':INPATH:/singlefile/textdoc' using TextLoader() as (sentence); |
| c = cross a, b; |
| d = foreach c generate name, flatten(TOKENIZE((chararray)sentence)) as sentence; |
| e = foreach d generate CONCAT((chararray)name, sentence); |
| store e into ':OUTPATH:';\, |
| |
| }, |
| { |
| # untyped column renamed (age as student_age) and then used in a comparison and in arithmetic |
| 'num' => 3, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa: double); |
| b = foreach a generate age as student_age; |
| c = filter b by student_age > 50; |
| d = foreach c generate student_age + 10; |
| store d into ':OUTPATH:';\, |
| |
| }, |
| { |
| # explicit casts feeding the CreateMap UDF from testudf.jar, then a map lookup and a split on null |
| 'num' => 4, |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = filter a by name lt 'b'; |
| c = foreach b generate org.apache.pig.test.udf.evalfunc.CreateMap((chararray)name, (int)age); |
| d = foreach c generate $0#'alice young'; |
| split d into e if $0 is not null, f if $0 is null; |
| store e into ':OUTPATH:';\, |
| } |
| ], |
| }, |
| { |
| 'name' => 'Casts', |
| 'tests' => [ |
| { |
| # check that a cast of a value of type |
| # same as the result type of the cast works |
| # when the value is treated as a bytearray |
| 'num' => 1, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| b = foreach a generate name, age, gpa; |
| store b into ':OUTPATH:.intermediate' using BinStorage(); |
| c = load ':OUTPATH:.intermediate' using BinStorage(); |
| -- after this load, the fields are treated as bytearrays though |
| -- they are actually "typed", test that the implicit casts |
| -- introduced by the operations in the foreach below will work fine |
| d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2; |
| store d into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| { |
| # check that a cast of a value of type |
| # same as the result type of the cast works |
| # when the value is treated as a bytearray |
| 'num' => 2, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float); |
| b = foreach a generate name, age, gpa; |
| store b into ':OUTPATH:.intermediate' using BinStorage(); |
| c = load ':OUTPATH:.intermediate' using BinStorage(); |
| -- after this load, the fields are treated as bytearrays though |
| -- they are actually "typed", test that the implicit casts |
| -- introduced by the operations in the foreach below will work fine |
| d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f; |
| store d into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| { |
| # check that complex values (bag, tuple, map) stored with BinStorage |
| # can be reloaded with a declared bag/tuple/map schema and then used by |
| # COUNT, tuple field access and map lookup |
| 'num' => 3, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float); |
| b = group a by name; |
| c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2']; |
| -- store the bag, tuple and map |
| store c into ':OUTPATH:.intermediate' using BinStorage(); |
| d = load ':OUTPATH:.intermediate' using BinStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]); |
| -- after this load, the fields are treated as bytearrays though |
| -- they are actually "typed", test that the implicit casts |
| -- introduced by the operations in the foreach below will work fine |
| e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2'; |
| store e into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| { |
| # check that fields stored with PigStorage and reloaded without a schema |
| # (so treated as bytearrays) are implicitly cast correctly when used in |
| # CONCAT and in int / double arithmetic |
| 'num' => 4, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| b = foreach a generate name, age, gpa; |
| store b into ':OUTPATH:.intermediate' using PigStorage(); |
| c = load ':OUTPATH:.intermediate' using PigStorage(); |
| -- after this load, the fields are treated as bytearrays though |
| -- they are actually "typed", test that the implicit casts |
| -- introduced by the operations in the foreach below will work fine |
| d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1, $2 + 0.2; |
| store d into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| { |
| # check that fields stored with PigStorage and reloaded without a schema |
| # (so treated as bytearrays) are implicitly cast correctly when used in |
| # CONCAT and in long / float arithmetic (1L, 0.2f literals) |
| 'num' => 5, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float); |
| b = foreach a generate name, age, gpa; |
| store b into ':OUTPATH:.intermediate' using PigStorage(); |
| c = load ':OUTPATH:.intermediate' using PigStorage(); |
| -- after this load, the fields are treated as bytearrays though |
| -- they are actually "typed", test that the implicit casts |
| -- introduced by the operations in the foreach below will work fine |
| d = foreach c generate CONCAT((chararray)$0, 'test'), $1 + 1L, $2 + 0.2f; |
| store d into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| { |
| # check that complex values (bag, tuple, map) stored with PigStorage |
| # can be reloaded with a declared bag/tuple/map schema and then used by |
| # COUNT, tuple field access and map lookup |
| 'num' => 6, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa: float); |
| b = group a by name; |
| c = foreach b generate a, (1,2,3), ['key1'#'value1','key2'#'value2']; |
| -- store the bag, tuple and map |
| store c into ':OUTPATH:.intermediate' using PigStorage(); |
| d = load ':OUTPATH:.intermediate' using PigStorage() as (b:bag{t:tuple(x,y,z)}, t2:tuple(a,b,c), m:map[]); |
| -- after this load, the fields are treated as bytearrays though |
| -- they are actually "typed", test that the implicit casts |
| -- introduced by the operations in the foreach below will work fine |
| e = foreach d generate COUNT(b), t2.a, t2.b, t2.c, m#'key1', m#'key2'; |
| store e into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name, age, gpa, instate); |
| b = foreach a generate (boolean)instate; |
| c = filter b by instate == true; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = foreach a generate instate; |
| c = filter b by instate == 'true'; |
| store c into ':OUTPATH:';\, |
| } |
| ], |
| }, |
| { |
| 'name' => 'ClassResolution', |
| 'tests' => [ |
| { |
| # check that Loader specified without a package |
| # name works if that package name is specified |
| # in udf.import.list |
| 'num' => 1, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'java_params' => ['-Dudf.import.list=org.apache.pig.test.udf.storefunc'], |
| 'pig' => q\ |
| register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa: double); |
| b = foreach a generate CONCAT('(', name), CONCAT((chararray)age, ' )'); |
| store b into ':OUTPATH:.intermediate' using PigStorage(','); |
| c = load ':OUTPATH:.intermediate' using DumpLoader(); |
| store c into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| ], |
| }, |
| |
| { |
| 'name' => 'MergeJoin', |
| 'tests' => [ |
| # Simplest merge-join. |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2'; |
| g = join e by $0, f by $0 using 'merge'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| g = join a by $0, b by $0; |
| store g into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| # Merge-join with left-side filter |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| h = filter e by $1 > 30; |
| f = load ':OUTPATH:.intermediate2'; |
| g = join h by $0, f by $0 using 'merge'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| h = filter a by $1 > 30; |
| g = join h by $0, b by $0; |
| store g into ':OUTPATH:';\, |
| 'notmq' => 1, |
| |
| }, |
| # Merge-join with right-side filter |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2'; |
| i = filter f by $2 != 'democrat'; |
| g = join e by $0, i by $0 using 'merge'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| i = filter b by $2 != 'democrat'; |
| g = join a by $0, i by $0; |
| store g into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| # Merge-join with schemas |
| { |
| 'num' => 4, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float); |
| f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float); |
| g = join e by $0, f by $0 using 'merge'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| g = join a by $0, b by $0; |
| store g into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| # Merge-join on a composite key (tuple of the first two columns) |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0,$1; |
| d = order b by $0,$1; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2'; |
| g = join e by ($0,$1), f by ($0,$1) using 'merge'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| g = join a by ($0,$1), b by ($0,$1); |
| store g into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| # Merge-join with an expression as key. This expression ($1+10) keeps the ordering of the inputs, which are sorted on $1. |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $1; |
| d = order b by $1; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2'; |
| g = join e by ($1+10), f by ($1+10) using 'merge'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| g = join a by ($1+10), b by ($1+10) ; |
| store g into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| # Merge-join with nulls in keys and data. |
| { |
| 'num' => 7, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = load ':INPATH:/singlefile/voternulltab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2'; |
| g = join e by $0, f by $0 using 'merge'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = load ':INPATH:/singlefile/voternulltab10k'; |
| g = join a by $0, b by $0; |
| store g into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| # Merge-join with one file across multiple blocks |
| { |
| 'num' => 8, |
| 'execonly' => 'mapred,tez', # since this join will run out of memory in local mode |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k'; |
| b = load ':INPATH:/singlefile/studenttab20m'; |
| h = filter b by $2 < 1.5; |
| c = order a by $0; |
| d = order h by $0 parallel 1; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, reg:chararray, contrib:float); |
| f = load ':OUTPATH:.intermediate2'as (name:chararray, age:int, gpa:float); |
| g = join e by $0, f by $0 using 'merge'; |
| i = filter g by $2 == 'democrat' and $1 > 76; |
| store i into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k'; |
| b = load ':INPATH:/singlefile/studenttab20m'; |
| h = filter b by $2 < 1.5; |
| g = join a by $0, h by $0; |
| i = filter g by $2 == 'democrat' and $1 > 76; |
| store i into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| # Merge-join with join on numeric key |
| { |
| 'num' => 9, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); |
| c = order a by age; |
| d = order b by age; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2'; |
| exec; |
| e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float); |
| f = load ':OUTPATH:.intermediate2' as (name:chararray, age:int, reg:chararray, contrib:float); |
| g = join e by age, f by age using 'merge'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); |
| g = join a by age, b by age; |
| store g into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| |
| ] |
| }, |
| { |
| 'name' => 'SkewedJoin', |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name, b by name using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name, b by name; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| |
| # basic join with no skewed keys |
| { |
| 'num' => 2, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=10000'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, |
| gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name, b by name using 'skewed'; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' =>q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, |
| gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name, b by name ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| |
| # join after filtering |
| { |
| 'num' => 3, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, |
| gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by $0, d by $0 using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, |
| gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by $0, d by $0 ; |
| store e into ':OUTPATH:';\, |
| }, |
| |
| # join by two columns |
| { |
| 'num' => 4, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=3'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by (name, age), d by (name, age) using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by (name, age), d by (name, age) ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| |
| # join with add |
| { |
| 'num' => 5, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=50'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by age+10, d by age + 20 using 'skewed' parallel 10; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray,age:int, gpa:double); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by age+10, d by age + 20 ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| |
| # join with split (relation a is joined with a filtered copy of itself) |
| { |
| 'num' => 6, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = filter a by $1 > 25; |
| c = join a by $0, b by $0 using 'skewed' parallel 7; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = filter a by $1 > 25; |
| c = join a by $0, b by $0 ; |
| store c into ':OUTPATH:';\, |
| |
| }, |
| |
| # join with UDF |
| { |
| 'num' => 7, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=20'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by SIZE(name), d by SIZE(name) using 'skewed' parallel 7; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| e = join c by SIZE(name), d by SIZE(name) ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # left outer join |
| { |
| 'num' => 8, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name left outer, b by name using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name left outer, b by name ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # right outer join |
| { |
| 'num' => 9, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name right outer, b by name using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name right outer, b by name ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # full outer join |
| { |
| 'num' => 10, |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name full outer, b by name using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name full outer, b by name ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # right outer join with fixed memory: same scripts as test 9, with |
| # pig.skewedjoin.reduce.mem additionally set to a fixed value |
| { |
| 'num' => 11, |
| # One JVM option per list element, as the other multi-option java_params |
| # lists in this file do; a single space-joined string would reach the JVM |
| # as one malformed option if the harness passes the elements as separate arguments. |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100', '-Dpig.skewedjoin.reduce.mem=516947966'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name right outer, b by name using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| # reference result: the same right outer join without the 'skewed' strategy |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = join a by name right outer, b by name ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # right outer join with empty left relation |
| { |
| 'num' => 12, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = filter a by name=='abc'; |
| e = join b by name right outer, a by name using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = foreach a generate (null, null, null, name, age, gpa); |
| c = foreach b generate flatten($0); |
| store c into ':OUTPATH:';\, |
| |
| }, |
| # left outer join with fixed memory; the right input is first |
| # filtered down to names < 'b' |
| { |
| 'num' => 13, |
| # One JVM option per list element, as the other multi-option java_params |
| # lists in this file do; a single space-joined string would reach the JVM |
| # as one malformed option if the harness passes the elements as separate arguments. |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100', '-Dpig.skewedjoin.reduce.mem=516947966'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| b = filter b by name < 'b'; |
| e = join a by name left outer, b by name using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| # reference result: the same left outer join without the 'skewed' strategy |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| b = filter b by name < 'b'; |
| e = join a by name left outer, b by name ; |
| store e into ':OUTPATH:';\, |
| }, |
| # full outer join with fixed memory; the right input is first |
| # filtered down to names > 'm' |
| { |
| 'num' => 14, |
| # One JVM option per list element, as the other multi-option java_params |
| # lists in this file do; a single space-joined string would reach the JVM |
| # as one malformed option if the harness passes the elements as separate arguments. |
| 'java_params' => ['-Dpig.skewedjoin.reduce.maxtuple=100', '-Dpig.skewedjoin.reduce.mem=516947966'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| b = filter b by name > 'm'; |
| e = join a by name full outer, b by name using 'skewed' parallel 8; |
| store e into ':OUTPATH:';\, |
| # reference result: the same full outer join without the 'skewed' strategy |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| b = filter b by name > 'm'; |
| e = join a by name full outer, b by name ; |
| store e into ':OUTPATH:';\, |
| |
| }, |
| ] |
| |
| }, |
| |
| { |
| 'name' => 'CollectedGroup', |
| 'tests' => [ |
| # Simplest collected group. |
| { |
| 'num' => 1, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = order a by $0; |
| store b into ':OUTPATH:.intermediate'; |
| exec; |
| register :FUNCPATH:/testudf.jar; |
| c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader(); |
| d = group c by $0 using 'collected'; |
| e = foreach d generate group, COUNT(c); |
| store e into ':OUTPATH:';\, |
| 'notmq' => 1, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| d = group a by $0 ; |
| e = foreach d generate group, COUNT(a); |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # Collected group with filter |
| { |
| 'num' => 2, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = order a by $0; |
| store b into ':OUTPATH:.intermediate'; |
| exec; |
| register :FUNCPATH:/testudf.jar; |
| c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader(); |
| d = filter c by $1 > 30; |
| e = group d by $0 using 'collected'; |
| f = foreach e generate group, COUNT(d); |
| store f into ':OUTPATH:';\, |
| 'notmq' => 1, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| d = filter a by $1 > 30; |
| e = group d by $0 ; |
| f = foreach e generate group, COUNT(d); |
| store f into ':OUTPATH:';\, |
| |
| }, |
| # Collected group with schemas |
| { |
| 'num' => 3, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = order a by $0; |
| store b into ':OUTPATH:.intermediate'; |
| exec; |
| register :FUNCPATH:/testudf.jar; |
| c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float); |
| d = group c by $0 using 'collected'; |
| e = foreach d generate group, MAX(c.age); |
| store e into ':OUTPATH:';\, |
| 'notmq' => 1, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| d = group a by $0 ; |
| e = foreach d generate group, MAX(a.$1); |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # Collected group with multiple columns |
| { |
| 'num' => 4, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = order a by name, age; |
| store b into ':OUTPATH:.intermediate'; |
| exec; |
| register :FUNCPATH:/testudf.jar; |
| c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float); |
| d = group c by (name, age) using 'collected'; |
| e = foreach d generate group.name, group.age, MIN(c.gpa); |
| store e into ':OUTPATH:';\, |
| 'notmq' => 1, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| d = group a by (name, age) ; |
| e = foreach d generate group.name, group.age, MIN(a.gpa); |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # Collected group with nulls in keys and data. |
| { |
| 'num' => 5, |
| 'pig' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = order a by $0; |
| store b into ':OUTPATH:.intermediate'; |
| exec; |
| register :FUNCPATH:/testudf.jar; |
| c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float); |
| d = group c by $0 using 'collected'; |
| e = foreach d generate group, SUM(c.$1); |
| store e into ':OUTPATH:';\, |
| 'notmq' => 1, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float); |
| d = group a by $0 ; |
| e = foreach d generate group, SUM(a.$1); |
| store e into ':OUTPATH:';\, |
| |
| }, |
| # Collected group with numeric key |
| { |
| 'num' => 6, |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = order a by age; |
| store b into ':OUTPATH:.intermediate'; |
| exec; |
| register :FUNCPATH:/testudf.jar; |
| c = load ':OUTPATH:.intermediate' using org.apache.pig.test.udf.storefunc.SimpleCollectableLoader() as (name:chararray, age:int, gpa:float); |
| d = group c by age using 'collected'; |
| e = foreach d generate group, AVG(c.gpa), COUNT(c.name); |
| store e into ':OUTPATH:';\, |
| 'notmq' => 1, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| d = group a by age ; |
| e = foreach d generate group, AVG(a.gpa), COUNT(a.name); |
| store e into ':OUTPATH:';\, |
| |
| }, |
| ] |
| }, |
| { |
| 'name' => 'SecondarySort', |
| 'tests' => [ |
| { |
| # simple order by |
| 'num' => 1, |
| 'java_params' => ['-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = order a by name; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # order by desc |
| 'num' => 2, |
| 'java_params' => ['-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = order a by name desc; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # order by float type |
| 'num' => 3, |
| 'java_params' => ['-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = order a by gpa; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| # order by string type |
| { |
| 'num' => 4, |
| 'java_params' => ['-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = order a by name; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| # simple distinct |
| { |
| 'num' => 5, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = a.name; |
| e = distinct d; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(e); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| # distinct on tuple |
| { |
| 'num' => 6, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = distinct a; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| # sort by two columns |
| { |
| 'num' => 7, |
| 'java_params' => ['-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = order a by gpa, name desc; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.gpa), org.apache.pig.test.udf.evalfunc.AllFirstLetter(d.name); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| # sort, distinct mix (order by name, then distinct on gpa) |
| { |
| 'num' => 8, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = order a by name; |
| e = d.gpa; |
| f = distinct e; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| # sort, distinct mix (order by gpa, then distinct on the same column) |
| { |
| 'num' => 9, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = group a by age parallel 10; |
| c = foreach b { |
| d = order a by gpa; |
| e = d.gpa; |
| f = distinct e; |
| generate group, org.apache.pig.test.udf.evalfunc.AllFirstLetter(f); |
| }; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # secondary sort boolean |
| 'num' => 10, |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = group a by age; |
| c = foreach b { |
| d = order a by instate; |
| generate group, flatten(d); |
| }; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = group a by age; |
| c = foreach b { |
| d = order a by instate; |
| generate group, flatten(d); |
| }; |
| store c into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| # Accumulator: aggregates and UDFs evaluated over grouped bags. |
| # Every test passes -Dpig.exec.nocombiner=true; all but test 6 also set |
| # -Dpig.accumulative.batchsize. |
| 'name' => 'Accumulator', |
| 'tests' => [ |
| { |
| # SUM over one input of a cogroup, followed by a filter on the aggregate |
| 'num' => 1, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| e = cogroup a by name, b by name parallel 8; |
| f = foreach e generate group, SUM(a.age) as s; |
| g = filter f by s>0; |
| store g into ':OUTPATH:';\, |
| }, |
| { |
| # several built-in aggregates (COUNT, MAX, MIN) in one generate |
| 'num' => 2, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); |
| e = group a by name parallel 8; |
| f = foreach e generate group, COUNT(a), MAX(a.contributions), MIN(a.contributions) ; |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| # aggregates combined inside a single arithmetic expression |
| 'num' => 3, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); |
| e = group a by name parallel 8; |
| f = foreach e generate group, (MAX(a.contributions)-MIN(a.contributions))*COUNT(a) ; |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| # COUNT over a nested distinct inside the foreach block |
| 'num' => 4, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); |
| e = group a by name parallel 8; |
| f = foreach e { g = distinct a.age; generate group, COUNT(g);} |
| store f into ':OUTPATH:';\, |
| }, |
| { |
| # built-in COUNT next to two TestingAccumulatorHelper UDFs, batch size 1; |
| # the verify script expects yes_acc to equal the count and no_acc to be 0 |
| 'num' => 5, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=1'], |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); |
| register :FUNCPATH:/testudf.jar; |
| DEFINE YesAccumulate org.apache.pig.TestingAccumulatorHelper('false'); |
| DEFINE NoAccumulate org.apache.pig.TestingAccumulatorHelper('true'); |
| b = foreach (group a all) generate COUNT(a) as ct, YesAccumulate(a) as yes_acc, NoAccumulate(a) as no_acc; |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); |
| b = foreach (group a all) generate COUNT(a) as ct; |
| c = foreach b generate ct, ct as ct2, 0; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # same helper UDFs next to NonAlgNonAccCount, with no batch size set; |
| # the verify script expects both helper columns to be 1 |
| 'num' => 6, |
| 'java_params' => ['-Dpig.exec.nocombiner=true'], |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); |
| register :FUNCPATH:/testudf.jar; |
| DEFINE YesAccumulate org.apache.pig.TestingAccumulatorHelper('false'); |
| DEFINE NoAccumulate org.apache.pig.TestingAccumulatorHelper('true'); |
| b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a) as ct, YesAccumulate(a) as yes_acc, NoAccumulate(a) as no_acc; |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:int, registration, contributions); |
| register :FUNCPATH:/testudf.jar; |
| b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a) as ct; |
| c = foreach b generate ct, 1, 1; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # IteratingAccumulator* UDFs next to built-in COUNT; verified against |
| # the built-in COUNT, SUM and IsEmpty |
| 'num' => 7, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions); |
| register :FUNCPATH:/testudf.jar; |
| b = foreach (group a all) generate COUNT(a), |
| org.apache.pig.test.udf.evalfunc.IteratingAccumulatorCount(a), |
| org.apache.pig.test.udf.evalfunc.IteratingAccumulatorSum(a.age), |
| org.apache.pig.test.udf.evalfunc.IteratingAccumulatorIsEmpty(a); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions); |
| b = foreach (group a all) generate COUNT(a), SUM(a.age), IsEmpty(a); |
| c = foreach b generate $0, *; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # same IteratingAccumulator* UDFs, but with NonAlgNonAccCount as the |
| # leading column instead of built-in COUNT |
| 'num' => 8, |
| 'java_params' => ['-Dpig.exec.nocombiner=true', '-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions); |
| register :FUNCPATH:/testudf.jar; |
| b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a), |
| org.apache.pig.test.udf.evalfunc.IteratingAccumulatorCount(a), |
| org.apache.pig.test.udf.evalfunc.IteratingAccumulatorSum(a.age), |
| org.apache.pig.test.udf.evalfunc.IteratingAccumulatorIsEmpty(a); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age:long, registration, contributions); |
| register :FUNCPATH:/testudf.jar; |
| b = foreach (group a all) generate org.apache.pig.test.udf.evalfunc.NonAlgNonAccCount(a), COUNT(a), SUM(a.age), IsEmpty(a); |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| # PruneColumns: load a multi-column file but use only one of the columns |
| 'name' => 'PruneColumns', |
| 'tests' => [ |
| { |
| # three columns are declared on load; only age is projected and stored |
| 'num' => 1, |
| 'execonly' => 'mapred,tez', # studenttab20m not available in local mode |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab20m' using PigStorage() as (name, age, gpa); |
| b = foreach a generate age; |
| store b into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| # Bzip: store to an intermediate path ending in .bz2 or .bz, load it back |
| # and store the result. Tests 1-2 set pig.bzip.use.hadoop.inputformat=true, |
| # tests 3-4 set it to false. |
| 'name' => 'Bzip', |
| 'tests' => [ |
| { |
| # test reading and writing out files with .bz2 extension |
| # relying on Hadoop's bzipcodec (for 0.23/2.X and after) |
| 'num' => 1, |
| 'java_params' => ['-Dpig.bzip.use.hadoop.inputformat=true'], |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| store a into ':OUTPATH:.intermediate.bz2'; |
| b = load ':OUTPATH:.intermediate.bz2'; |
| store b into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| { |
| # test reading and writing with .bz extension |
| # relying on Hadoop's bzipcodec (for 0.23/2.X and after) |
| 'num' => 2, |
| 'java_params' => ['-Dpig.bzip.use.hadoop.inputformat=true'], |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| store a into ':OUTPATH:.intermediate.bz'; |
| b = load ':OUTPATH:.intermediate.bz'; |
| store b into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| { |
| # test reading and writing out files with .bz2 extension |
| # using Bzip2TextInputFormat. |
| 'num' => 3, |
| 'java_params' => ['-Dpig.bzip.use.hadoop.inputformat=false'], |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| store a into ':OUTPATH:.intermediate.bz2'; |
| b = load ':OUTPATH:.intermediate.bz2'; |
| store b into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| { |
| # test reading and writing with .bz extension |
| # using Bzip2TextInputFormat. |
| 'num' => 4, |
| 'java_params' => ['-Dpig.bzip.use.hadoop.inputformat=false'], |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| store a into ':OUTPATH:.intermediate.bz'; |
| b = load ':OUTPATH:.intermediate.bz'; |
| store b into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| ] |
| }, |
| { |
| # Scalar: a field of one relation (e.g. c.avg) is referenced inside a |
| # statement that operates on a different relation. |
| 'name' => 'Scalar', |
| 'tests' => [ |
| { |
| # test scalar in foreach (most common) |
| 'num' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = group a all; |
| c = foreach b generate AVG(a.gpa) as avg, MAX(a.gpa) as max; |
| y = foreach a generate name, (gpa - c.avg) / c.max; |
| store y into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| { |
| # test scalar in filter |
| 'num' => 2, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = group a all; |
| c = foreach b generate AVG(a.gpa) as avg; |
| y = filter a by gpa > c.avg; |
| store y into ':OUTPATH:';\, |
| }, |
| { |
| # test scalar with two branches: the scalar is computed from one load |
| # and used to filter a second, separate load |
| 'num' => 3, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = group a all; |
| c = foreach b generate AVG(a.age) as avg; |
| x = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions); |
| y = filter x by age > c.avg; |
| store y into ':OUTPATH:';\, |
| }, |
| { |
| # test with scalar from two inputs |
| 'num' => 4, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = group a all; |
| c = foreach b generate AVG(a.age) as avg; |
| d = load ':INPATH:/singlefile/votertab10k' using PigStorage() as (name, age, registration, contributions); |
| e = group d all; |
| f = foreach e generate AVG(d.age) as avg; |
| y = foreach a generate age/c.avg, age/f.avg; |
| store y into ':OUTPATH:';\, |
| }, |
| { |
| # test scalar with split: c is both the input of d and referenced as a |
| # scalar (c.avg, c.cnt) in f |
| 'num' => 5, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = group a all; |
| c = foreach b generate AVG(a.age) as avg, COUNT(a.age) as cnt; |
| d = foreach c generate avg; |
| e = group d by $0; |
| f = foreach e generate group, c.avg, c.cnt; |
| store f into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| # Scripting: UDFs registered from Python files with 'using jython'. Each |
| # test has a verify_pig_script that computes the expected result without |
| # the scripted UDF. |
| 'name' => 'Scripting', |
| 'tests' => [ |
| { |
| # test integer square |
| 'num' => 1, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test string concat and referencing function without a namespace |
| 'num' => 2, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa); |
| b = foreach a generate concat(name) as name; |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = foreach a generate CONCAT(name, name); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test long and double square, plus two references to the same UDF with different schemas |
| 'num' => 3, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double); |
| b = foreach a generate myfuncs.square(age), myfuncs.square(gpa); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age, gpa * gpa; |
| store b into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| { |
| # test method with no schema decorator (ie, returns bytearray) |
| 'num' => 4, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa); |
| b = foreach a generate myfuncs.byteconcat(name); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = foreach a generate CONCAT(name, name); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test complex types: a map, a tuple and a bag are passed to the UDF and |
| # its flattened result is dereferenced again |
| 'num' => 5, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb); |
| c = foreach b generate mm#'name', mt.$0, mb.$0; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = foreach a generate SIZE(m#'name'), t.$2, b.$2; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test null input and output |
| 'num' => 6, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test functions that call other functions and include other files |
| 'num' => 7, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.redirect(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test that functions with same names resolve correctly across name spaces |
| 'num' => 8, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| register ':SCRIPTHOMEPATH:/python/morepythonudfs.py' using jython as morefuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age), morefuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age, age * age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test UDF applied to a grouped bag (count), verified against built-in COUNT |
| 'num' => 9, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = group a by name; |
| c = foreach b generate group, myfuncs.count(a); |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = group a by name; |
| c = foreach b generate group, COUNT(a); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # test UDF taking a boolean argument (adjustgpa with instate:boolean); |
| # the verify script loads instate as chararray and compares it with 'true' |
| 'num' => 10, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = foreach a generate name, myfuncs.adjustgpa(gpa, instate); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = foreach a generate name, (instate=='true'?gpa:gpa+1); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test UDF whose output is verified against 'true'/'false' strings |
| # (isretired); presumably the UDF returns a boolean - confirm in scriptingudf.py |
| 'num' => 11, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate name, myfuncs.isretired(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = foreach a generate name, (age>=60?'true':'false'); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # jython udf which returns an array |
| 'num' => 12, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/python/scriptingudf.py' using jython as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:chararray, gpa:chararray); |
| b = foreach a generate CONCAT(CONCAT(age, ' '), gpa) as sentence; |
| c = foreach b generate flatten(myfuncs.tokenize(sentence)); |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray); |
| b = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray); |
| c = foreach a generate age; |
| d = foreach b generate gpa; |
| e = union c, d; |
| store e into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| # RubyUDFs: UDFs registered from Ruby files with 'using jruby'. Each test |
| # has a verify_pig_script that computes the expected result without the |
| # Ruby UDF. |
| 'name' => 'RubyUDFs', |
| 'tests' => [ |
| { |
| # test integer square |
| 'num' => 1, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test string concat (the function is referenced through the myfuncs namespace) |
| 'num' => 2, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa); |
| b = foreach a generate myfuncs.concat(name, name); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = foreach a generate CONCAT(name, name); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test long and double square, plus two references to the same UDF with different schemas |
| 'num' => 3, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double); |
| b = foreach a generate myfuncs.square(age), myfuncs.square(gpa); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age, gpa * gpa; |
| store b into ':OUTPATH:';\, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| }, |
| { |
| # test method with no schema decorator (ie, returns bytearray) |
| 'num' => 4, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = foreach a generate myfuncs.byteconcat(name, name); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = foreach a generate CONCAT(name, name); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test method with complex types |
| 'num' => 5, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb); |
| c = foreach b generate mm#'name', mt.$0, mb.$0; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = foreach a generate SIZE(m#'name'), t.$2, b.$2; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test null input and output |
| 'num' => 6, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test functions that call other functions and include other files |
| 'num' => 7, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.redirect(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test functions from two registered scripts under separate namespaces |
| # (myfuncs.square, morefuncs.cube and morefuncs.CUBE) |
| 'num' => 8, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as morefuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age), morefuncs.cube(age), morefuncs.CUBE(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age, age * age * age, age * age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test algebraic functions |
| 'num' => 9, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = group a by name; |
| c = foreach b generate group, myfuncs.Count(a); |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = group a by name; |
| c = foreach b generate group, COUNT(a); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # test accumulator functions |
| 'num' => 10, |
| 'java_params' => ['-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/scriptingudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = group a by name; |
| c = foreach b generate group, myfuncs.Sum(a.age), myfuncs.Sum(a.gpa); |
| d = foreach c generate $0, $1, (double)(ROUND($2*100))/100; |
| store d into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = group a by name; |
| c = foreach b generate group, SUM(a.age), SUM(a.gpa); |
| d = foreach c generate $0, $1, (double)(ROUND($2*100))/100; |
| store d into ':OUTPATH:';\, |
| }, |
| { |
| # test UDF whose result is flattened (reverse); verified against the |
| # input columns in swapped order (age, name) |
| 'num' => 11, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate flatten(myfuncs.reverse(name, age)); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age, name; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test UDF used as a filter condition (ISEVEN), verified against age%2==0 |
| 'num' => 12, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = filter a by myfuncs.ISEVEN(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = filter a by age%2==0; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test UDF applied to an ordered bag (AppendIndex) with batch size 5; |
| # verified against the Java AppendIndex UDF from testudf.jar |
| 'num' => 13, |
| 'java_params' => ['-Dpig.accumulative.batchsize=5'], |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/ruby/morerubyudfs.rb' using jruby as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach (group a all){ |
| a1= order a by name,age,gpa; |
| generate FLATTEN(myfuncs.AppendIndex(a1)); |
| } |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach (group a all){ |
| a1=order a by name,age,gpa; |
| generate FLATTEN(org.apache.pig.test.udf.evalfunc.AppendIndex(a1)); |
| } |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| # JavaScriptUDFs: UDF registered from a .js file with 'using javascript' |
| 'name' => 'JavaScriptUDFs', |
| 'tests' => [ |
| { |
| # test double square; verified against gpa * gpa |
| 'num' => 1, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/js/scriptingudf.js' using javascript as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(gpa); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate gpa * gpa; |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| # GroovyUDFs: UDF registered from a .groovy file with 'using groovy' |
| 'name' => 'GroovyUDFs', |
| 'tests' => [ |
| { |
| # test integer square; verified against age * age |
| 'num' => 1, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/groovy/scriptingudf.groovy' using groovy as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'StreamingPythonUDFs', |
| 'tests' => [ |
| { |
| # test integer square |
| 'num' => 1, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test string concat and referencing function without a namespace |
| 'num' => 2, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa); |
| b = foreach a generate concat(name) as name; |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = foreach a generate CONCAT(name, name); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test long square and double square (separate square and squareDouble functions), each with a declared result type |
| 'num' => 3, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:long, gpa:double); |
| b = foreach a generate myfuncs.square(age) as age:long, myfuncs.squareDouble(gpa) as gpa:double; |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age, ((double) ROUND((gpa * gpa)*10000)) / 10000.0; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test bytearray |
| 'num' => 4, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age, gpa); |
| b = foreach a generate myfuncs.byteconcat(name); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = foreach a generate CONCAT(name, name); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test complex types |
| 'num' => 5, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = foreach a generate flatten(myfuncs.complexTypes(m, t, b)) as (mm, mt, mb); |
| c = foreach b generate mm#'name', mt.$0, mb.$0; |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = foreach a generate SIZE(m#'name'), t.$2, b.$2; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test null input and output |
| 'num' => 6, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentnulltab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test functions that call other functions and include other files |
| 'num' => 7, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.redirect(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test that functions with same names resolve correctly across name spaces |
| 'num' => 8, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| register ':SCRIPTHOMEPATH:/cpython/morepythonudfs.py' using streaming_python as morefuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate myfuncs.square(age), morefuncs.square(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = foreach a generate age * age, age * age * age; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test UDF applied to a grouped bag (count), verified against built-in COUNT |
| 'num' => 9, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = group a by name; |
| c = foreach b generate group, myfuncs.count(a); |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa:double); |
| b = group a by name; |
| c = foreach b generate group, COUNT(a); |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # test a UDF that takes a boolean argument (instate is loaded as boolean) |
| 'num' => 10, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/allscalar10k' as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = foreach a generate name, myfuncs.adjustgpa(gpa, instate); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = foreach a generate name, ((double)ROUND((instate=='true'?gpa:gpa+1)*10000)) / 10000.0; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test a UDF whose result is verified against a 'true'/'false' expression (age>=60) |
| 'num' => 11, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| b = foreach a generate name, myfuncs.isretired(age); |
| store b into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:double); |
| b = foreach a generate name, (age>=60?'true':'false'); |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # udf which returns an array |
| 'num' => 12, |
| 'pig' => q\ |
| register ':SCRIPTHOMEPATH:/cpython/scriptingudf.py' using streaming_python as myfuncs; |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:chararray, gpa:chararray); |
| b = foreach a generate CONCAT(CONCAT(age, ' '), gpa) as sentence; |
| c = foreach b generate flatten(myfuncs.tokenize(sentence)); |
| store c into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray); |
| b = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray, gpa:chararray); |
| c = foreach a generate age; |
| d = foreach b generate gpa; |
| e = union c, d; |
| store e into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Native', |
| 'tests' => [ |
| { |
| # test common |
| # 'pig' invokes the jar with the `native` keyword while 'verify_pig_script' uses |
| # `mapreduce`; the two scripts are otherwise identical. |
| 'num' => 1, |
| 'pig' => q\ |
| rmf table_testNativeMRJobSimple_input |
| rmf table_testNativeMRJobSimple_output |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = native ':MAPREDJARS:/hadoop-examples.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; |
| store b into ':OUTPATH:';\, |
| 'notmq' => 1, |
| 'verify_pig_script' => q\ |
| rmf table_testNativeMRJobSimple_input |
| rmf table_testNativeMRJobSimple_output |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; |
| store b into ':OUTPATH:';\, |
| }, |
| { |
| # test complex |
| # Projects the name column, de-duplicates it, feeds it through the wordcount job |
| # (invoked here with the `native` keyword) and orders the loaded result by name. |
| 'num' => 2, |
| 'pig' => q\ |
| rmf table_testNativeMRJobSimple_input |
| rmf table_testNativeMRJobSimple_output |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = foreach a generate name; |
| c = distinct b; |
| d = native ':MAPREDJARS:/hadoop-examples.jar' Store c into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; |
| e = order d by name; |
| store e into ':OUTPATH:';\, |
| 'sortArgs' => ['-t', ' '], |
| 'notmq' => 1, |
| 'verify_pig_script' => q\ |
| rmf table_testNativeMRJobSimple_input |
| rmf table_testNativeMRJobSimple_output |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = foreach a generate name; |
| c = distinct b; |
| d = mapreduce ':MAPREDJARS:/hadoop-examples.jar' Store c into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `wordcount table_testNativeMRJobSimple_input table_testNativeMRJobSimple_output`; |
| e = order d by name; |
| store e into ':OUTPATH:';\, |
| }, |
| { |
| # test streaming |
| # 'pig23' repeats the 'pig' script with hadoop-0.23.0-streaming.jar in place of |
| # hadoop-streaming.jar. This entry has no 'verify_pig_script'. |
| # NOTE(review): presumably the harness selects 'pig23' when running on Hadoop 0.23 - confirm. |
| 'num' => 3, |
| 'pig' => q\ |
| rmf table_testNativeMRJobSimple_input |
| rmf table_testNativeMRJobSimple_output |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = mapreduce ':MAPREDJARS:/hadoop-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper cat -reducer wc`; |
| store b into ':OUTPATH:';\, |
| 'pig23' => q\ |
| rmf table_testNativeMRJobSimple_input |
| rmf table_testNativeMRJobSimple_output |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = mapreduce ':MAPREDJARS:/hadoop-0.23.0-streaming.jar' Store a into 'table_testNativeMRJobSimple_input' Load 'table_testNativeMRJobSimple_output' as (name:chararray, count: int) `-input table_testNativeMRJobSimple_input -output table_testNativeMRJobSimple_output -mapper cat -reducer wc`; |
| store b into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| ] |
| }, |
| { |
| # Group 'Partitioner': registers testudf.jar and groups by age with |
| # PARTITION BY org.apache.pig.test.utils.SimpleCustomPartitioner2 and parallel 2, |
| # then stores the per-group COUNT. |
| 'name' => 'Partitioner', |
| 'tests' => [ |
| { |
| # test group |
| 'num' => 1, |
| # NOTE(review): the reason below mentions a join, but this script performs a |
| # group, not a join; the wording may be stale - confirm. |
| 'execonly' => 'mapred,tez', # since this join will run out of memory in local mode |
| 'pig' => q\register :FUNCPATH:/testudf.jar; |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age:int, gpa); |
| b = group a by age PARTITION BY org.apache.pig.test.utils.SimpleCustomPartitioner2 parallel 2; |
| c = foreach b generate group, COUNT(a); |
| store c into ':OUTPATH:';\, |
| 'java_params' => ['-Dpig.exec.mapPartAgg=false'] |
| }, |
| ] |
| }, |
| { |
| #################################################################### |
| # SUB : CastScalar |
| # FEATURE: adds functionality that allows to cast elements of a single-tuple relation into a scalar value. |
| # JIRA: Pig-1434 |
| # |
| # TEST ITEMS: |
| # 1 Test syntax |
| # 2 Test scalar for simple data type |
| # 3 Test scalar for complex data type: tuple, bag, map |
| # 4 Test implicit cast |
| # 5 Test explicit cast |
| # 6 Positional parameter |
| # 7 Cast within an aggregate function |
| # 8 Cast within an UDF function |
| # 9 Cast with a FOREACH |
| # 10 Cast with a FILTER |
| # 11 Cast with a SPLIT |
| # 12 Cast in a JOIN |
| # 13 Multiquery |
| # 14 Cast on a schema that cannot be inferred should result in bytearray |
| # 15 Replicated Join |
| # 16 Test operations such as R1 * (int)R1 |
| # 17 CheckSingular(*) |
| # 18 missing field in scalar file |
| # 19 scalar referenced from an empty file |
| # 20 empty input directory |
| # 21 Single row vs Multiple Row |
| # 22 Cast on a multi-field tuple |
| # 23 Reference a non-scalar as a scalar |
| # 24 Test multiple loaders |
| |
| 'name' => 'CastScalar', |
| 'tests' => [ |
| { |
| # 2 Test scalar for simple data type |
| # 3 Test scalar for complex data type: tuple, bag, map |
| # 9 Cast with a FOREACH |
| #INPATH = /user/hadoopqa/pig/tests/data |
| |
| # Uses the single-row relation c as a scalar via the named field c.total. |
| # NOTE(review): alias e (order d by name, d_sum) is defined but never stored; |
| # the script stores d - confirm this is intended. |
| 'num' => 1, |
| 'pig' => q# |
| a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| b = group a all; |
| c = foreach b generate SUM(a.age) as total; |
| d = foreach a generate name, age+(double)c.total as d_sum; |
| e = order d by name, d_sum; |
| store d into ':OUTPATH:'; |
| #, |
| # 6 Positional parameter |
| |
| }, { |
| |
| # Uses the single-row relation c as a scalar via the positional field c.$0. |
| # NOTE(review): alias e (order d by name, d_sum) is defined but never stored; |
| # the script stores d - confirm this is intended. |
| 'num' => 2, |
| 'pig' => q# |
| a = load ':INPATH:/singlefile/studenttab10k' as (name: chararray, age: int, gpa: float); |
| b = group a all; |
| c = foreach b generate SUM(a.age) as total; |
| d = foreach a generate name, age+(double)c.$0 as d_sum; |
| e = order d by name, d_sum; |
| store d into ':OUTPATH:'; |
| #, |
| # 2 Test scalar for simple data type |
| # 3 Test scalar for complex data type:map |
| # 9 Cast with a FOREACH |
| # 13 Multiquery |
| # 24 Test multiple loaders |
| #INPATH = /user/hadoopqa/pig/tests/data |
| |
| }, { |
| |
| # 4 Test implicit cast |
| # 10 Cast with a FILTER |
| # |
| # I set the benchmark to use "19" because pig truncates during cast and sql rounds up. |
| 'num' => 7, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, AVG(a.gpa)+20 as avg_gpa; |
| d = order c by avg_gpa; |
| simple_scalar = limit d 1; |
| f = filter a by age < (int) simple_scalar.avg_gpa; |
| g = order f by name, age, gpa; |
| store g into ':OUTPATH:';\, |
| |
| }, { |
| # 5 Test explicit cast |
| # 10 Cast with a FILTER |
| 'num' => 8, |
| |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, AVG(a.age) AS average; |
| d = order c by average; |
| simple_scalar = limit d 1; |
| d = filter a by age > (int) simple_scalar.average; |
| e = foreach d generate name, age; |
| store e into ':OUTPATH:'; |
| \, |
| }, { |
| # 5 Test explicit cast |
| # 6 Positional parameter |
| 'num' => 9, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, AVG(a.age) AS average; |
| d = order c by average; |
| simple_scalar = limit d 1; |
| d = filter a by age > (int) simple_scalar.$1; |
| e = foreach d generate name, age; |
| store e into ':OUTPATH:'; |
| \, |
| }, { |
| # 4 Test implicit cast |
| # 6 Positional parameter |
| # 10 Cast with a FILTER |
| 'num' => 10, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, AVG(a.age) AS average; |
| d = order c by average; |
| simple_scalar = limit d 1; |
| d = filter a by age > simple_scalar.$1; |
| e = foreach d generate name, age; |
| store e into ':OUTPATH:'; |
| \, |
| }, { |
| # 4 Test implicit cast |
| # 6 Positional parameter |
| # 11 Cast with a SPLIT |
| 'num' => 11, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = group a by name; |
| c = foreach b generate group, AVG(a.age) AS average; |
| d = order c by average; |
| simple_scalar = limit d 1; |
| split a into X1 if age > (int) simple_scalar.$1, X2 if age < 20; |
| split a into X3 if age > (int) simple_scalar.$1, X4 if age > 70; |
| |
| store X1 into ':OUTPATH:.1'; |
| store X2 into ':OUTPATH:.2'; |
| store X3 into ':OUTPATH:.3'; |
| store X4 into ':OUTPATH:.4'; |
| \, |
| }, { |
| # 4 Test implicit cast |
| # 6 Positional parameter |
| # 12 Cast with a JOIN |
| 'num' => 12, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name, age, gpa); |
| b = load ':INPATH:/singlefile/votertab10k' as (name, age, registration, contributions); |
| c = filter a by age < 20; |
| d = filter b by age < 20; |
| simple_scalar = limit d 1; |
| e = join c by name, d by name; |
| f= filter e by c::age <(int)simple_scalar.age; |
| store f into ':OUTPATH:';\, |
| }, |
| ] |
| |
| },{ |
| |
| 'name' => 'udf_TOBAGandTOTUPLE', |
| |
| 'sortResults' => 1, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'tests' => [ |
| { |
| # TEST : resulting schema for TOBAG/TOTUPLE with simple types |
| # TEST : resulting schema for TOBAG/TOTUPLE with positional parameters |
| # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE and standard projections |
| # TEST : resulting schema for various projects using a combination of TOBAG/TOTUPLE using AS clause |
| |
| 'num' => 1 |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B = limit A 10; |
| Gen1 = FOREACH B GENERATE $0, $1, $2 ; |
| GroupById = GROUP B BY id; |
| |
| B1 = foreach B generate TOBAG( intnum1000, id, intnum5); |
| B3 = foreach B generate TOBAG( $0, $1, $2); |
| T1= foreach B generate TOTUPLE( intnum1000, id, intnum5); |
| T2= foreach B generate TOTUPLE( $0, $1, $2); |
| T3 = foreach B generate TOTUPLE( $0, $0, $0); |
| T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7; |
| T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7; |
| T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate; |
| describe Gen1; |
| describe GroupById; |
| describe B1; |
| describe B3; |
| describe T1; |
| describe T2; |
| describe T3; |
| describe T4; |
| describe T5; |
| describe T6; |
| ? |
| # NOTE(review): 'expected_out_regex' is repeated as a key within this one hash. |
| # In a Perl hash constructor a later duplicate key replaces the earlier value, |
| # so only the last pattern below (T6) is kept. Confirm whether the B1..T5 |
| # patterns were meant to be checked as well. |
| ,'expected_out_regex' => 'B1: {{int}}' |
| ,'expected_out_regex' => 'B3: {{int}}' |
| ,'expected_out_regex' => 'T1: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}' |
| ,'expected_out_regex' => 'T2: {org.apache.pig.builtin.totuple_id_.*: (intnum1000: int,id: int,intnum5: int)}' |
| ,'expected_out_regex' => 'T3: {org.apache.pig.builtin.totuple_intnum1000.*: (intnum1000: int,intnum1000: int,intnum1000: int)}' |
| ,'expected_out_regex' => 'T4: {{int},org.apache.pig.builtin.totuple_intnum100.*: (intnum100: int,intnum: int,longnum: long),floatnum: float,doublenum: double}' |
| ,'expected_out_regex' => 'T5: {intnum1000: int,id: int,org.apache.pig.builtin.totuple_intnum100.*: (intnum5: int,intnum100: int,intnum: int).*{NULL}.*doublenum: double}' |
| ,'expected_out_regex' => "T6: {intnum1000: int,org.apache.pig.builtin.totuple_intnum1000.*: \\(intnum1000: int,intnum1000: int,intnum1000: int\\),{\\(int\\)},duplicate: int}" |
| }, { |
| # TEST : bag of mixed data types |
| # TEST : Order |
| # TEST : positional parameters |
| 'num' => 2 |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| C = foreach A generate TOBAG( id, floatnum, doublenum ); |
| D = foreach A generate TOBAG( id, intnum); |
| E = foreach A generate TOBAG( (float) id,floatnum ); |
| F = foreach A generate TOBAG( (long) id,longnum ); |
| G = foreach A generate TOBAG( (double) id,doublenum ); |
| describe C; |
| describe D; |
| describe E; |
| describe F; |
| describe G; |
| ? |
| # NOTE(review): 'expected_out_regex' is repeated as a key within this one hash. |
| # In a Perl hash constructor a later duplicate key replaces the earlier value, |
| # so only the last pattern below (G) is kept. Confirm whether the C..F |
| # patterns were meant to be checked as well. |
| ,'expected_out_regex' => 'C: {{\\(NULL\\)}}' |
| ,'expected_out_regex' => 'D: {{\\(int\\)}}' |
| ,'expected_out_regex' => 'E: {{\\(float\\)}}' |
| ,'expected_out_regex' => 'F: {{\\(long\\)}}' |
| ,'expected_out_regex' => 'G: {{\\(double\\)}}' |
| |
| }, { |
| # TEST : TOBAG/TOTUPLE with simple types |
| # TEST : TOBAG/TOTUPLE with positional parameters |
| # TEST : various projects using a combination of TOBAG/TOTUPLE and standard projections |
| # TEST : various projects using a combination of TOBAG/TOTUPLE using AS clause |
| 'num' => 3 |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B = limit A 10; |
| B1 = foreach B generate TOBAG( intnum1000, id, intnum5); |
| B2 = foreach B generate TOBAG( $0, $1, $2); |
| T1= foreach B generate TOTUPLE( intnum1000, id, intnum5); |
| T2= foreach B generate TOTUPLE( $0, $1, $2); |
| T3 = foreach B generate TOTUPLE( $0, $0, $0); |
| T4= foreach B generate TOBAG($0, $1, $2), TOTUPLE($3, $4, $5), $6, $7; |
| T5= foreach B generate $0, $1, TOTUPLE($2, $3, $4), TOBAG($5, $6), $7; |
| T6= foreach B generate $0, TOTUPLE($0, $0, $0), TOBAG($0, $0), $0 AS duplicate; |
| Gen1 = FOREACH B GENERATE $0, $1, $2 ; |
| GroupById = GROUP B BY id; |
| store Gen1 into ':OUTPATH:.1'; |
| store GroupById into ':OUTPATH:.2'; |
| store B1 into ':OUTPATH:.3'; |
| store B2 into ':OUTPATH:.4'; |
| store T1 into ':OUTPATH:.5'; |
| store T2 into ':OUTPATH:.6'; |
| store T3 into ':OUTPATH:.7'; |
| store T4 into ':OUTPATH:.8'; |
| ? |
| }, { |
| # TEST : cast for TOTUPLE/TOBAG |
| 'num' => 4 |
| ,'ignore' => 1 # different error message for different version of hadoop |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B= limit A 10; |
| C = foreach B generate $0, TOTUPLE((int) $0, (long) $0, (double) $0), TOBAG( (float) $0, (chararray) $0), $0; |
| store C into ':OUTPATH:'; |
| ? |
| ,'expected_err_regex' => 'ERROR 1108: Duplicate schema alias' |
| ,'rc' => 6 |
| |
| }, { |
| # TEST : cast for TOTUPLE/TOBAG |
| 'num' => 5 |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B= limit A 1; |
| C = foreach B generate $0, TOTUPLE((int) $0); |
| D = foreach B generate $0, TOTUPLE((long) $0); |
| E = foreach B generate $0, TOTUPLE((double) $0); |
| F = foreach B generate $0, TOTUPLE((float) $0); |
| G = foreach B generate $0, TOTUPLE((chararray) $0); |
| store B into ':OUTPATH:.1'; |
| store C into ':OUTPATH:.2'; |
| store D into ':OUTPATH:.3'; |
| store E into ':OUTPATH:.4'; |
| store F into ':OUTPATH:.5'; |
| store G into ':OUTPATH:.6'; |
| ? |
| }, { |
| |
| #TEST more complicated nested functions such as TOTUPLE(TOBAG()) |
| #TEST more complicated nested functions such as TOBAG(TOTUPLE()) |
| #TEST more complicated nested functions such as TOTUPLE(TOTUPLE()) |
| #TEST more complicated nested functions such as TOBAG(TOBAG()) |
| 'num' => 6 |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B = limit A 10; |
| tint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) ); |
| bint = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); |
| binb = foreach B generate TOBAG( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); |
| tinb = foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); |
| store B into ':OUTPATH:.1'; |
| store tint into ':OUTPATH:.2'; |
| store bint into ':OUTPATH:.3'; |
| store binb into ':OUTPATH:.4'; |
| store tinb into ':OUTPATH:.5'; |
| ? |
| |
| }, { |
| #TEST arithmetic operation in TOTUPLE and TOBAG |
| #TEST aggregate function - NOT IMPLEMENTED |
| #TEST tuple with 50+ items |
| #TEST with null |
| 'num' => 7 |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B = limit A 10; |
| B1= foreach B generate TOTUPLE( $1, $2, $3); |
| T1= foreach B generate TOTUPLE( $1, $2, $3); |
| R1= foreach B generate TOTUPLE( $1, $0+1, $0+2, $0+3),TOBAG($0+4, $0+1 ); |
| R2= foreach B generate TOTUPLE( $0, $1, $2, $3, $4, $5, $6, $7, (int) 8, (int) 9 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $6, $7 , (int) 29, (int) 30, $0, $1, $2, $3, $4, $5, $6, $7, (int) 39, (int) 40 , $1, $2, $3, $4, $5, $6, $7, (int) 19, (int) 20, $0, $1, $2, $3, $4, $5, $5, $7 ); |
| |
| R3= foreach B generate $0, TOTUPLE(0,0,0), TOBAG( 0, 0 ); |
| R4= foreach B generate $0, TOTUPLE(null, id, null), TOBAG( id, null, id,null ); |
| |
| describe R1; |
| describe R2; |
| describe R3; |
| describe R4; |
| |
| store B into ':OUTPATH:.1'; |
| store B1 into ':OUTPATH:.2'; |
| store R1 into ':OUTPATH:.3'; |
| store R2 into ':OUTPATH:.4'; |
| store R3 into ':OUTPATH:.5'; |
| store R4 into ':OUTPATH:.6'; |
| ? |
| |
| }, { |
| # TEST more TOTUPLE and TOBAG nested combinations |
| 'num' => 8 |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B = limit A 10; |
| C = foreach B generate TOBAG( $0, $1, $2); |
| T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) ); |
| T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); |
| T3= foreach B generate TOBAG( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7)); |
| store B into ':OUTPATH:.1'; |
| store C into ':OUTPATH:.2'; |
| store T1 into ':OUTPATH:.3'; |
| store T2 into ':OUTPATH:.4'; |
| store T3 into ':OUTPATH:.5'; |
| ? |
| ,'verify_pig_script' => q?register :FUNCPATH:/testudf.jar; |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B = limit A 10; |
| C = foreach B generate TOBAG( $0, $1, $2); |
| T1= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOTUPLE($3, $4, $5) ); |
| T2= foreach B generate TOTUPLE( TOBAG( $1, $2, $3),TOBAG($3, $4, $5) ); |
| T3= foreach B generate org.apache.pig.test.udf.evalfunc.TOBAG2( TOTUPLE( $1, $2, $3), TOTUPLE($4,$5), TOTUPLE($6,$7)); |
| store B into ':OUTPATH:.1'; |
| store C into ':OUTPATH:.2'; |
| store T1 into ':OUTPATH:.3'; |
| store T2 into ':OUTPATH:.4'; |
| store T3 into ':OUTPATH:.5'; |
| ? |
| }, { |
| #TEST negative test case: out of bounds positional parameter |
| # EVERYTHING IS CORRECT |
| 'num' => 9 |
| ,'ignore' => 1 # different error message for different version of hadoop |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B = limit A 10; |
| C = foreach B generate $0, $1, TOTUPLE($2, $998, $4), TOBAG($5, $6), $7; |
| ? |
| ,'expected_err_regex' => 'Out of bound access.*non-existent column: 998' |
| }, { |
| #TEST negative test case: out of bounds positional parameter |
| # EVERYTHING IS CORRECT |
| 'num' => 10 |
| ,'ignore' => 1 # different error message for different version of hadoop |
| ,'pig' => q? |
| A = load ':INPATH:/types/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); |
| B = limit A 10; |
| C = foreach B generate $0, $1, TOBAG($5, $999), $7; |
| ? |
| ,'expected_err_regex' => 'Out of bound access.*non-existent column: 999' |
| |
| }, |
| |
| ] # end of tests |
| |
| },{ |
| |
| # Group 'ToStuffSyntaxSugar': each 'pig' script uses a bracketed literal form |
| # ((..), {..}, [..]) and each 'verify_pig_script' spells the same projection with |
| # the explicit call (TOTUPLE, TOBAG, TOMAP). Test 4 is the counter-case: a single |
| # parenthesised field is compared against a plain arithmetic expression. |
| 'name' => 'ToStuffSyntaxSugar', |
| 'tests' => [ |
| { |
| #TEST TOTUPLE syntax sugar |
| 'num' => 1, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = foreach A generate (name, age); |
| store B into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = foreach A generate TOTUPLE(name, age); |
| store B into ':OUTPATH:';\, |
| }, { |
| #TEST TOBAG syntax sugar |
| 'num' => 2, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = foreach A generate {name, age}; |
| store B into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = foreach A generate TOBAG(name, age); |
| store B into ':OUTPATH:';\, |
| }, { |
| #TEST TOMAP syntax sugar |
| 'num' => 3, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = foreach A generate [name, age]; |
| store B into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = foreach A generate TOMAP(name, age); |
| store B into ':OUTPATH:';\, |
| }, { |
| #TEST verify single element inside parenthesis does NOT call TOTUPLE |
| 'num' => 4, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = foreach A generate (age) + 1; |
| store B into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = foreach A generate (age + 1); |
| store B into ':OUTPATH:';\, |
| } |
| ] # end of tests |
| },{ |
| |
| 'name' => 'MergeOperator', |
| |
| 'tests' => [ |
| { |
| # Test Union using merge where schema is identical | A&B have identical schema |
| 'num' => 1, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| C = union onschema A, B; |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| C = union A, B; |
| store C into ':OUTPATH:';\, |
| },{ |
| # Test Union using merge with type promotions, int->long and float->double |
| 'num' => 2, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| # 'pig' unions (age:int, gpa:float) with (age:long, gpa:double) using onschema. |
| # NOTE(review): in 'verify_pig_script' alias D (the long/double cast projection) |
| # is defined but never stored; C is stored instead - confirm this is intended. |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:long, gpa:double); |
| C = union onschema A, B; |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| C = union A, B; |
| D = foreach C generate name, (long)age, (double)gpa; |
| store C into ':OUTPATH:';\, |
| },{ |
| # Test Union using merge with type promotions, int->float |
| 'num' => 3, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| # 'pig' unions (age:int) with (age:float) using onschema; the verify script loads |
| # age as float on both sides. |
| # NOTE(review): in 'verify_pig_script' alias D (the float cast projection) is |
| # defined but never stored; C is stored instead - confirm this is intended. |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float); |
| C = union onschema A, B; |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:float, gpa:float); |
| C = union A, B; |
| D = foreach C generate name, (float)age, gpa; |
| store C into ':OUTPATH:';\, |
| },{ |
| # Test Union using merge with type promotions, int->double |
| 'num' => 4, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| # 'pig' unions (age:int) with (age:double) using onschema; the verify script loads |
| # age as double on both sides. |
| # NOTE(review): in 'verify_pig_script' alias D (the double cast projection) is |
| # defined but never stored; C is stored instead - confirm this is intended. |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float); |
| C = union onschema A, B; |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:double, gpa:float); |
| C = union A, B; |
| D = foreach C generate name, (double)age, gpa; |
| store C into ':OUTPATH:';\, |
| },{ |
| # Test Union of an intersection |
| 'num' => 5, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float); |
| C = union onschema A, B; |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| register :FUNCPATH:/testudf.jar; |
| define Nil org.apache.pig.test.udf.evalfunc.Nil(); |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float); |
| C = foreach A generate name, age, (chararray)gpa, Nil(), Nil(); |
| D = foreach B generate name, age, Nil(), registration, (chararray)contributions; |
| E = union C, D; |
| store E into ':OUTPATH:';\, |
| }, |
| { |
| # Test Union where the intersection is null |
| 'num' => 6, |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/textdoc' as (line:chararray); |
| C = union onschema A, B; |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| register :FUNCPATH:/testudf.jar; |
| define Nil org.apache.pig.test.udf.evalfunc.Nil(); |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int, gpa:float); |
| B = load ':INPATH:/singlefile/textdoc' as (line:chararray); |
| C = foreach A generate name, (chararray)age, (chararray)gpa, Nil(name); |
| D = foreach B generate Nil(line), Nil(line), Nil(line), line; |
| E = union C, D; |
| store E into ':OUTPATH:';\, |
| }, |
| { |
| # Test Union using merge where schema is identical and contains a boolean column | a&b have identical schema |
| 'num' => 7, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); |
| b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); |
| C = union onschema a, b; |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| b = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| C = union a, b; |
| store C into ':OUTPATH:';\, |
| }, |
| { |
| # Test Union using merge with incompatible types. int->bytearray and chararray->bytearray |
| 'num' => 8, |
| 'delimiter' => ' ', |
| 'pig' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:int); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:chararray); |
| C = union onschema A, B; |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:bytearray); |
| B = load ':INPATH:/singlefile/studenttab10k' using PigStorage() as (name:chararray, age:bytearray); |
| C = union A, B; |
| store C into ':OUTPATH:';\, |
| } |
| ] |
| |
| }, |
| { |
| |
| # Group 'UdfDistributedCache': defines a UDF (Udfcachetest from the registered |
| # script's namespace org.apache.pig.test.udf.evalfunc) with a constructor argument |
| # of the form '<input file>#foodle', applies it to one row and dumps the result. |
| 'name' => 'UdfDistributedCache', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'java_params' => ['-Dopt.fetch=false'], |
| 'execonly' => 'mapred,tez', # since distributed cache is not supported in local mode |
| 'pig' => q? |
| register :FUNCPATH:/testudf.jar; |
| define udfdc org.apache.pig.test.udf.evalfunc.Udfcachetest(':INPATH:/singlefile/votertab10k#foodle'); |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = limit a 1; |
| c = foreach b generate udfdc(age); |
| dump c;?, |
| # NOTE(review): the pattern is a :...: placeholder; presumably the harness |
| # substitutes the real expected output before matching - confirm. |
| 'expected_out_regex' => ":UdfDistributedCache_1_out:", |
| }, |
| ] |
| }, { |
| # Group 'MonitoredUDF': registers testudf.jar and defines the GoodMonitored (gm) |
| # and BadMonitored (bad) eval funcs; each test is verified against a script that |
| # generates a string literal instead of calling the UDF. |
| 'name' => 'MonitoredUDF', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'ignore23' => 'guava version of Pig is higher than hadoop 23', |
| 'pig' => q?register :FUNCPATH:/testudf.jar; |
| define gm org.apache.pig.test.udf.evalfunc.GoodMonitored(); |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate gm(name); |
| store b into ':OUTPATH:';?, |
| 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = foreach a generate 'fred'; |
| store b into ':OUTPATH:';?, |
| },{ |
| # NOTE(review): both scripts store b (the one-row limit), not c, so the alias |
| # that calls bad(name) is never stored - confirm this is intended. |
| 'num' => 2, |
| 'pig' => q?register :FUNCPATH:/testudf.jar; |
| define bad org.apache.pig.test.udf.evalfunc.BadMonitored(); |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = limit a 1; |
| c = foreach b generate bad(name); |
| store b into ':OUTPATH:';?, |
| 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = limit a 1; |
| c = foreach b generate ''; |
| store b into ':OUTPATH:';?, |
| },{ |
| # NOTE(review): 'pig' is identical to test 2, and the verify scripts differ only |
| # in the literal assigned to the unstored alias c ('' vs 'barney'); both store b. |
| # Confirm whether c was meant to be stored. |
| 'num' => 3, |
| 'pig' => q?register :FUNCPATH:/testudf.jar; |
| define bad org.apache.pig.test.udf.evalfunc.BadMonitored(); |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = limit a 1; |
| c = foreach b generate bad(name); |
| store b into ':OUTPATH:';?, |
| 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = limit a 1; |
| c = foreach b generate 'barney'; |
| store b into ':OUTPATH:';?, |
| } |
| ], |
| },{ |
| 'name' => 'MergeSparseJoin', |
| 'tests' => [ |
| # Simplest merge-sparse-join. |
| { |
| 'num' => 1, |
| 'pig' => q\register :PIGGYBANKJAR: |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| g = join e by $0, f by $0 using 'merge-sparse'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| g = join a by $0, b by $0; |
| store g into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| # Merge-sparse-join with left-side filter |
| { |
| 'num' => 2, |
| 'pig' => q\register :PIGGYBANKJAR: |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| h = filter e by $1 > 30; |
| f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| g = join h by $0, f by $0 using 'merge-sparse'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| h = filter a by $1 > 30; |
| g = join h by $0, b by $0; |
| store g into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| # Merge-sparse-join with right-side filter |
| { |
| 'num' => 3, |
| 'pig' => q\register :PIGGYBANKJAR: |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| i = filter f by $2 != 'democrat'; |
| g = join e by $0, i by $0 using 'merge-sparse'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| i = filter b by $2 != 'democrat'; |
| g = join a by $0, i by $0; |
| store g into ':OUTPATH:';\, |
| |
| 'notmq' => 1, |
| }, |
| # Merge-sparse-join with key as expression |
| { |
| 'num' => 4, |
| 'pig' => q\register :PIGGYBANKJAR: |
| a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| c = order a by $0,$1; |
| d = order b by $0,$1; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1'); |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0,1'); |
| g = join e by ($0,$1), f by ($0,$1) using 'merge-sparse'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k'; |
| b = load ':INPATH:/singlefile/votertab10k'; |
| g = join a by ($0,$1), b by ($0,$1); |
| store g into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| # Merge-sparse-join with nulls in keys and data. |
| { |
| 'num' => 5, |
| 'pig' => q\register :PIGGYBANKJAR: |
| a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = load ':INPATH:/singlefile/voternulltab10k'; |
| c = order a by $0; |
| d = order b by $0; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| exec; |
| e = load ':OUTPATH:.intermediate1'; |
| f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| g = join e by $0, f by $0 using 'merge-sparse'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studentnulltab10k'; |
| b = load ':INPATH:/singlefile/voternulltab10k'; |
| g = join a by $0, b by $0; |
| store g into ':OUTPATH:';\, |
| 'notmq' => 1, |
| }, |
| # Merge-sparse-join with join on numeric key |
| { |
| 'num' => 6, |
| 'pig' => q\register :PIGGYBANKJAR: |
| a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); |
| c = order a by age; |
| d = order b by age; |
| store c into ':OUTPATH:.intermediate1'; |
| store d into ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0'); |
| exec; |
| e = load ':OUTPATH:.intermediate1' as (name:chararray, age:int, gpa:float); |
| f = load ':OUTPATH:.intermediate2' using org.apache.pig.piggybank.storage.IndexedStorage(',', '0') as (name:chararray, age:int, reg:chararray, contrib:float); |
| g = join e by age, f by age using 'merge-sparse'; |
| store g into ':OUTPATH:';\, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float); |
| b = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); |
| g = join a by age, b by age; |
| store g into ':OUTPATH:';\, |
| 'notmq' => 1, |
| } |
| ], |
| },{ |
| 'name' => 'BugFix', |
| 'tests' => [ |
| { |
| # PIG-2286 |
| 'num' => 1, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double, gpa:double); |
| B = group A all; |
| C = foreach B generate group, flatten(COR(A.age, A.gpa)); |
| store C into ':OUTPATH:';?, |
| 'verify_pig_script' => q?set pig.exec.nocombiner true |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double); |
| B = group A all; |
| C = foreach B generate group, flatten(COR(A.age, A.gpa)); |
| store C into ':OUTPATH:';?, |
| }, { |
| # PIG-2286, with 3 inputs to COR |
| 'num' => 2, |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double); |
| B = foreach A generate age, gpa, gpa*gpa as gpa2; |
| C = group B all; |
| D = foreach C generate group, flatten(COR(B.age, B.gpa, B.gpa2)); |
| store D into ':OUTPATH:';?, |
| 'verify_pig_script' => q?set pig.exec.nocombiner true |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:double ,gpa:double); |
| B = foreach A generate age, gpa, gpa*gpa as gpa2; |
| C = group B all; |
| D = foreach C generate group, flatten(COR(B.age, B.gpa, B.gpa2)); |
| store D into ':OUTPATH:';?, |
| }, { |
| # PIG-2385 |
| 'num' => 3, |
| 'pig_params' => ['-M'], |
| 'floatpostprocess' => 1, |
| 'delimiter' => ' ', |
| 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double); |
| Z = group A all; |
| Z1 = foreach Z generate AVG(A.gpa) as avg; |
| B = foreach A generate name, age, gpa-Z1.avg as diff; |
| STORE B INTO ':OUTPATH:.1'; |
| C = DISTINCT B ; |
| store C into ':OUTPATH:.2';?, |
| 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double); |
| Z = group A all; |
| Z1 = foreach Z generate AVG(A.gpa) as avg; |
| B = cross A, Z1; |
| B1 = foreach B generate name, age, gpa-Z1.avg as diff; |
| STORE B1 INTO ':OUTPATH:.1'; |
| C = DISTINCT B1 ; |
| store C into ':OUTPATH:.2';?, |
| }, { |
| # PIG-2576 |
| 'num' => 4, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q?register :FUNCPATH:/testudf.jar; |
| define printconf org.apache.pig.test.udf.evalfunc.UdfContextFrontend('dummy'); |
| a = load ':INPATH:/singlefile/studenttab10k' as (name, age, gpa); |
| b = limit a 1; |
| c = foreach b generate printconf(name); |
| store c into ':OUTPATH:'; |
| fs -ls; |
| ?, |
| 'rc' => 0, |
| 'not_expected_out_regex' => "checkJobConf: conf is null: false", |
| 'expected_out_regex' => "checkJobConf: conf is null: true", |
| }, { |
| # PIG-3051 |
| 'num' => 5, |
| 'pig' => q?A = LOAD ':INPATH:/singlefile/votertab10k' AS (name, age, registration, contributions); |
| -- dropping one column to force columnprune |
| B = foreach A generate age, registration,contributions; |
| C = order B by contributions; |
| D = limit C 3; |
| E = foreach D generate contributions; |
| STORE C INTO ':OUTPATH:.1'; |
| STORE E INTO ':OUTPATH:.2';?, |
| 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/votertab10k' AS (name, age, registration, contributions); |
| B = foreach A generate age, registration,contributions; |
| C = order B by contributions; |
| STORE C INTO ':OUTPATH:.1'; |
| F = LOAD ':INPATH:/singlefile/votertab10k' AS (name, age, registration, contributions); |
| G = foreach F generate age, registration,contributions; |
| H = order G by contributions; |
| I = limit H 3; |
| J = foreach I generate contributions; |
| STORE J INTO ':OUTPATH:.2';?, |
| }, { |
| # PIG-3641 |
| 'num' => 6, |
| 'pig' => q?A = LOAD ':INPATH:/singlefile/votertab10k' AS (name, age, registration, contributions); |
| -- dropping one column to force columnprune |
| B = foreach A generate name, age, registration; |
| -- Next line is the only difference |
| SPLIT B into C1 if age > 50, C2 otherwise; |
| D1 = foreach C1 generate age, registration; |
| STORE D1 INTO ':OUTPATH:.1'; |
| STORE C2 INTO ':OUTPATH:.2';?, |
| 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/votertab10k' AS (name, age, registration, contributions); |
| -- dropping one column to force columnprune |
| B = foreach A generate name, age, registration; |
| SPLIT B into C1 if age > 50, C2 if age <= 50; |
| D1 = foreach C1 generate age, registration; |
| STORE D1 INTO ':OUTPATH:.1'; |
| STORE C2 INTO ':OUTPATH:.2';?, |
| } |
| ], |
| },{ |
| 'name' => 'Bloom', |
| 'execonly' => 'mapred,tez', # distributed cache does not work in local mode |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', 'fixed', '128', '3'); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| B = filter A by name == 'alice allen'; |
| C = group B all; |
| D = foreach C generate bb(B.name); |
| store D into ':HDFSTMP:/mybloom_1'; |
| exec; |
| define bloom Bloom(':HDFSTMP:/mybloom_1'); |
| E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| F = filter E by bloom(name); |
| store F into ':OUTPATH:';", |
| 'notmq' => 1, |
| 'verify_pig_script' => " |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double); |
| B = filter A by name == 'alice allen'; |
| store B into ':OUTPATH:';", |
| }, { |
| 'num' => 2, |
| 'pig' => "define bb BuildBloom('Hash.MURMUR_HASH', 'fixed', '128', '3'); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| B = filter A by name == 'alice allen'; |
| C = group B all; |
| D = foreach C generate bb(B.name); |
| store D into ':HDFSTMP:/mybloom_2'; |
| exec; |
| define bloom Bloom(':HDFSTMP:/mybloom_2'); |
| E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| F = filter E by bloom(name); |
| G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); |
| H = join F by name, G by name; |
| store H into ':OUTPATH:';", |
| 'notmq' => 1, |
| 'verify_pig_script' => " |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double); |
| B = filter A by name == 'alice allen'; |
| C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); |
| D = join B by name, C by name; |
| store D into ':OUTPATH:';", |
| },{ |
| 'num' => 3, |
| 'pig' => "define bb BuildBloom('Hash.JENKINS_HASH', '1', '0.0001'); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| B = filter A by name == 'alice allen'; |
| C = group B all; |
| D = foreach C generate bb(B.name); |
| store D into ':HDFSTMP:/mybloom_3'; |
| exec; |
| define bloom Bloom(':HDFSTMP:/mybloom_3'); |
| E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| F = filter E by bloom(name); |
| G = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); |
| H = join G by name, F by name using 'repl'; |
| store H into ':OUTPATH:';", |
| 'notmq' => 1, |
| 'verify_pig_script' => " |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double); |
| B = filter A by name == 'alice allen'; |
| C = load ':INPATH:/singlefile/votertab10k'as (name:chararray, age:int, reg:chararray, contrib:float); |
| D = join C by name, B by name; |
| store D into ':OUTPATH:';", |
| },{ |
| 'num' => 4, |
| 'pig' => "set pig.optimizer.rules.disabled PushUpFilter; |
| define bb BuildBloom('Hash.JENKINS_HASH', 'fixed', '128', '3'); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| B = filter A by name == 'alice allen'; |
| C = group B all; |
| D = foreach C generate bb(B.name); |
| store D into ':HDFSTMP:/mybloom_4'; |
| exec; |
| define bloom Bloom(':HDFSTMP:/mybloom_4'); |
| E = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| F = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| G = union E, F; |
| -- PushUpFilter is disabled to avoid filter being pushed before union |
| H = filter G by bloom(name); |
| store H into ':OUTPATH:';", |
| 'notmq' => 1, |
| 'verify_pig_script' => " |
| A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double); |
| B = LOAD ':INPATH:/singlefile/studenttab10k' AS (name, age:int ,gpa:double); |
| C = UNION A,B; |
| D = filter C by name == 'alice allen'; |
| store D into ':OUTPATH:';", |
| } |
| ], |
| },{ |
| 'name' => 'UDFContext', |
| 'tests' => [ |
| { |
| # See PIG-2338 |
| 'num' => 1, |
| 'pig' => q?register :FUNCPATH:/testudf.jar |
| a = load ':INPATH:/singlefile/studenttab10k' AS (a0); |
| b = foreach a generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(a0); |
| c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray); |
| d = foreach c generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(c0); |
| e = union b, d; |
| store e into ':OUTPATH:';?, |
| 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0); |
| b = foreach a generate '{a0: bytearray}'; |
| c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray); |
| d = foreach c generate '{c0: chararray}'; |
| e = union b, d; |
| store e into ':OUTPATH:';?, |
| } |
| ], |
| |
| },{ |
| 'name' => 'UDFContextAuto', |
| 'tests' => [ |
| { |
| # See PIG-2337 |
| 'num' => 1, |
| 'pig' => q?register :FUNCPATH:/testudf.jar |
| a = load ':INPATH:/singlefile/studenttab10k' AS (a0); |
| b = foreach a generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(a0); |
| c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray); |
| d = foreach c generate org.apache.pig.test.udf.evalfunc.UDFContextTestUDF(c0); |
| e = union b, d; |
| store e into ':OUTPATH:';?, |
| 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0); |
| b = foreach a generate '{a0: bytearray}'; |
| c = load ':INPATH:/singlefile/studenttab10k' AS (c0:chararray); |
| d = foreach c generate '{c0: chararray}'; |
| e = union b, d; |
| store e into ':OUTPATH:';?, |
| } |
| ], |
| },{ |
| 'name' => 'JsonLoaderStorage', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| store A into ':OUTPATH:.intermediate' using JsonStorage(); |
| exec |
| A = LOAD ':OUTPATH:.intermediate' using JsonLoader(); |
| store A into ':OUTPATH:';?, |
| 'notmq' => 1, |
| 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double); |
| store A into ':OUTPATH:';?, |
| }, { |
| 'num' => 2, |
| 'pig' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int, gpa:double); |
| store A into ':OUTPATH:.intermediate1' using JsonStorage(); |
| B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double); |
| store B into ':OUTPATH:.intermediate2' using JsonStorage(); |
| exec |
| A = LOAD ':OUTPATH:.intermediate1' using JsonLoader(); |
| B = LOAD ':OUTPATH:.intermediate2' using JsonLoader(); |
| C = JOIN A by name, B by name; |
| store C into ':OUTPATH:';?, |
| 'notmq' => 1, |
| 'verify_pig_script' => q?A = LOAD ':INPATH:/singlefile/studenttab10k' AS (name:chararray, age:int,gpa:double); |
| B = LOAD ':INPATH:/singlefile/votertab10k' AS (name:chararray, age:int, registration:chararray, contributions:double); |
| C = JOIN A by name, B by name; |
| store C into ':OUTPATH:';?, |
| }, { |
| 'num' => 3, |
| 'ignore' => 1, # PIG-2594 |
| 'pig' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:boolean); |
| store a into ':OUTPATH:.intermediate' using JsonStorage(); |
| exec |
| B = LOAD ':OUTPATH:.intermediate' using JsonLoader(); |
| store B into ':OUTPATH:';\, |
| 'notmq' => 1, |
| 'verify_pig_script' => q\a = load ':INPATH:/singlefile/allscalar10k' using PigStorage() as (name:chararray, age:int, gpa:double, instate:chararray); |
| store a into ':OUTPATH:';\, |
| } |
| |
| ], |
| },{ |
| 'name' => 'STRSPLIT', |
| 'tests' => [ |
| { |
| # See PIG-2311 |
| 'num' => 1, |
| 'pig' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0); |
| b= filter a by NOT (a0 is null); |
| c= foreach b generate STRSPLIT(a0); |
| store c into ':OUTPATH:';?, |
| 'verify_pig_script' => q?a = load ':INPATH:/singlefile/studenttab10k' AS (a0); |
| b= filter a by NOT (a0 is null); |
| b= foreach b generate (chararray)a0 as a0 ; |
| c= foreach b generate STRSPLIT(a0); |
| store c into ':OUTPATH:';?, |
| } |
| ], |
| }, |
| { |
| 'name' => 'Tokenize', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k'; |
| B = foreach A generate TOKENIZE($0); |
| store B into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k'; |
| B = foreach A generate TOKENIZE($1,'9'); |
| store B into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k'; |
| -- TOKENIZE has tokens hardcoded so have to replace the '9' with |
| -- one of the hardcoded tokens |
| B = foreach A generate TOKENIZE(REPLACE($1, '9', ',')); |
| store B into ':OUTPATH:';\, |
| } |
| ] |
| }, { |
| 'name' => 'Realias', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k'; |
| B = A; |
| store B into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k'; |
| store A into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'NestedForEach', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = group A by name; |
| C = foreach B { |
| C1 = foreach A generate UPPER(name), age+1 as age, gpa; |
| generate C1; |
| } |
| D = foreach C generate flatten(C1); |
| store D into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = foreach A generate UPPER(name), age+1, gpa; |
| store B into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);; |
| B = group A by name; |
| C = foreach B { |
| C1 = A.age; |
| C2 = filter C1 by age>=30; |
| C3 = foreach C2 generate age+1 as age; |
| C4 = order C3 by age desc; |
| generate C4; |
| } |
| D = foreach C generate flatten(C4); |
| store D into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = filter A by age>=30; |
| C = foreach B generate age+1 as age; |
| D = order C by age desc; |
| store D into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'BagToTuple', |
| 'tests' => [ |
| { |
| # basic test of converting bag to tuples. Use the first and last tuple in the bag b |
| 'num' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| filterA = filter a by b is not null and COUNT(b) > 1; |
| b = foreach filterA { |
| order_desc = order b by age desc; |
| limit_desc = limit order_desc 1; |
| order_asc = order b by age asc; |
| limit_asc = limit order_asc 1; |
| generate FLATTEN(limit_desc), FLATTEN(limit_asc); |
| }; |
| c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa) as first_t, |
| TOTUPLE(limit_asc::name, limit_asc::age, limit_asc::gpa) as second_t; |
| d = foreach c generate TOBAG(first_t, second_t) as n_bag; |
| e = foreach d generate BagToTuple(n_bag); |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| filterA = filter a by b is not null and COUNT(b) > 1; |
| b = foreach filterA { |
| order_desc = order b by age desc; |
| limit_desc = limit order_desc 1; |
| order_asc = order b by age asc; |
| limit_asc = limit order_asc 1; |
| generate FLATTEN(limit_desc), FLATTEN(limit_asc); |
| }; |
| c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa, limit_asc::name,limit_asc::age, limit_asc::gpa) as big_t; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| # convert an existing tuple to a bag and use the output of BagToTuple |
| 'num' => 2, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = filter a by t is not null; |
| c = foreach b generate TOBAG(t) as newBag; |
| d = foreach c generate BagToTuple(newBag); |
| store d into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = filter a by t is not null; |
| c = foreach b generate t; |
| store c into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'BagToString', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| filterA = filter a by b is not null and COUNT(b) > 1; |
| b = foreach filterA { |
| order_desc = order b by age desc; |
| limit_desc = limit order_desc 1; |
| order_asc = order b by age asc; |
| limit_asc = limit order_asc 1; |
| generate FLATTEN(limit_desc), FLATTEN(limit_asc); |
| }; |
| c = foreach b generate TOTUPLE(limit_desc::name, limit_desc::age, limit_desc::gpa) as first_t, |
| TOTUPLE(limit_asc::name, limit_asc::age, limit_asc::gpa) as second_t; |
| d = foreach c generate TOBAG(first_t, second_t) as n_bag; |
| e = foreach d generate BagToString(n_bag); |
| store e into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| filterA = filter a by b is not null and COUNT(b) > 1; |
| b = foreach filterA { |
| order_desc = order b by age desc; |
| limit_desc = limit order_desc 1; |
| order_asc = order b by age asc; |
| limit_asc = limit order_asc 1; |
| generate FLATTEN(limit_desc), FLATTEN(limit_asc); |
| }; |
| c = foreach b generate CONCAT(limit_desc::name, CONCAT('_', CONCAT((chararray)limit_desc::age, CONCAT('_', CONCAT((chararray)limit_desc::gpa, CONCAT('_',CONCAT(limit_asc::name,CONCAT('_',CONCAT((chararray)limit_asc::age, CONCAT('_',(chararray)limit_asc::gpa)))))))))) as big_t; |
| store c into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = filter a by t is not null; |
| c = foreach b generate TOBAG(t) as newBag; |
| d = foreach c generate BagToString(newBag); |
| store d into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| a = load ':INPATH:/singlefile/studentcomplextab10k' using PigStorage() as (m:[], t:(name:chararray, age:int, gpa:double), b:{t:(name:chararray, age:int, gpa:double)}); |
| b = filter a by t is not null; |
| c = foreach b generate CONCAT(t.name, CONCAT('_', CONCAT((chararray)t.age, CONCAT('_', (chararray)t.gpa)))); |
| store c into ':OUTPATH:';\, |
| }, |
| ] |
| }, |
| { |
| 'name' => 'NestedCross', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'pig' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double); |
| C = cogroup A by name, B by name; |
| D = foreach C { |
| C1 = cross A, B; |
| generate flatten(C1); |
| } |
| store D into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double); |
| C = JOIN A by name, B by name; |
| store C into ':OUTPATH:';\, |
| }, |
| { |
| 'num' => 2, |
| 'pig' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double); |
| C = cogroup A by name, B by name; |
| D = foreach C { |
| C1 = filter A by gpa > 4; |
| C2 = filter B by contributions > 500; |
| C3 = cross C1, C2; |
| C4 = foreach C3 generate CONCAT(CONCAT((chararray)gpa, '_'), (chararray)contributions); |
| generate flatten(C4); |
| } |
| store D into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = LOAD ':INPATH:/singlefile/votertab10k' as (name:chararray, age:int, registration, contributions:double); |
| C = filter A by gpa > 4; |
| D = filter B by contributions > 500; |
| E = JOIN C by name, D by name; |
| F = foreach E generate CONCAT(CONCAT((chararray)gpa, '_'), (chararray)contributions); |
| store F into ':OUTPATH:';\, |
| } |
| ] |
| }, |
| { |
| 'name' => 'Rank', |
| 'tests' => [ |
| { |
| 'num' => 1, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 7; |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = rank A; |
| C = foreach B generate rank_A,a,b,c; |
| store C into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| C = foreach A generate rownumber,a,b,c; |
| store C into ':OUTPATH:'; |
| \, |
| }, { |
| 'num' =>2, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 9; |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = rank A by b DESC,a ASC; |
| C = foreach B generate rank_A,b,a; |
| store C into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| C = foreach A generate rankbdaa,b,a; |
| store C into ':OUTPATH:'; |
| \, |
| }, { |
| 'num' =>3, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 7; |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = rank A by c ASC,b DESC; |
| C = foreach B generate rank_A,c,b; |
| store C into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| C = foreach A generate rankcabd,c,b; |
| store C into ':OUTPATH:'; |
| \, |
| }, { |
| 'num' => 4, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 25; |
| A = LOAD ':INPATH:/singlefile/biggish' using PigStorage(',') as (rownumber:long,idx:long,tail:bytearray); |
| B = rank A; |
| C = order B by rank_A; |
| D = foreach C generate rank_A,rownumber; |
| store D into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/biggish' using PigStorage(',') as (rownumber:long,idx:long,tail:bytearray); |
| D = foreach A generate idx,rownumber; |
| store D into ':OUTPATH:'; |
| \, |
| }, { |
| 'num' => 5, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 11; |
| SET pig.splitCombination false; |
| A = LOAD ':INPATH:/singlefile/biggish' using PigStorage(',') as (rownumber:long,idx:long,tail:bytearray); |
| B = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| C = join A by rownumber, B by rownumber; |
| D = order C by B::rankcabd,B::rankbdca,B::rankaaba; |
| E = rank D; |
| F = group E by rank_D; |
| G = foreach F generate group, COUNT(E); |
| H = order G by group; |
| store H into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,idx:long,tail:bytearray); |
| B = foreach A generate rownumber,1; |
| C = order B by rownumber; |
| store C into ':OUTPATH:'; |
| \, |
| }, { |
| 'num' => 6, |
# RANK with no BY clause applied to one branch of a SPLIT.
# prerank is split into M (rownumber > 15) and N (rownumber < 25); only N is
# ranked and stored, M is never used afterwards. The rank column is projected
# by position ($0) together with a, b and c.
# The verify script filters prerank to rownumber < 25 and emits rownumber in the
# first position, so the test expects the rank of each N row to equal its
# rownumber. No default_parallel is set for this test.
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| split A into M if rownumber > 15, N if rownumber < 25; |
| C = rank N; |
| D = foreach C generate $0, a, b, c; |
| store D into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = filter A by rownumber < 25; |
| D = foreach B generate rownumber, a, b, c; |
| store D into ':OUTPATH:'; |
| \, |
| } |
| ] |
| }, |
| { |
# Test group Rank_Dense: every test applies RANK ... BY ... DENSE to the
# comma-separated prerank file and compares the result with rank columns that
# are already stored in that file. The stored column names used below line up
# with the sort keys of the script under test: rankaaba with (a ASC, b ASC),
# rankaacd with (a ASC, c DESC), rankbdca with (b DESC, c ASC).
# NOTE(review): execonly and verify_pig_script are consumed by the test harness,
# which is not visible here; presumably execonly limits the run to the listed
# execution modes and verify_pig_script produces the expected output - confirm.
| 'name' => 'Rank_Dense', |
| 'tests' => [ |
| { |
# Dense rank by (a ASC, b ASC) with default_parallel 9; expected rank is read
# from the stored rankaaba column.
| 'num' => 1, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 9; |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = rank A by a ASC,b ASC DENSE; |
| C = foreach B generate rank_A,a,b; |
| store C into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| C = foreach A generate rankaaba,a,b; |
| store C into ':OUTPATH:'; |
| \, |
| }, { |
# Dense rank by (a ASC, c DESC) with default_parallel 9; expected rank is read
# from the stored rankaacd column.
| 'num' => 2, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 9; |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = rank A by a ASC,c DESC DENSE; |
| C = foreach B generate rank_A,a,c; |
| store C into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| C = foreach A generate rankaacd,a,c; |
| store C into ':OUTPATH:'; |
| \, |
| }, { |
# Dense rank by (b DESC, c ASC) with default_parallel 7; expected rank is read
# from the stored rankbdca column.
| 'num' => 3, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 7; |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = rank A by b DESC,c ASC DENSE; |
| C = foreach B generate rank_A,b,c; |
| store C into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| C = foreach A generate rankbdca,b,c; |
| store C into ':OUTPATH:'; |
| \, |
| }, { |
# Three dense ranks chained on one pipeline, each ranking the output of the
# previous one. The final projection lists rank_D, rank_C, rank_B, which are the
# ranks computed by (b DESC, c ASC), (a ASC, c DESC) and (a ASC, b ASC) in that
# order, and the verify script lists rankbdca, rankaacd, rankaaba to match.
| 'num' => 4, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 7; |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = foreach A generate a,b,c,tail; |
| C = rank B by a ASC,b ASC DENSE; |
| D = rank C by a ASC,c DESC DENSE; |
| E = rank D by b DESC,c ASC DENSE; |
| F = foreach E generate rank_D,rank_C,rank_B,a,b,c; |
| store F into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = foreach A generate rankbdca,rankaacd,rankaaba,a,b,c; |
| store B into ':OUTPATH:'; |
| \, |
| }, { |
# Two independent dense ranks of the same relation B, joined with each other on
# their first field ($0), with default_parallel 9 and pig.splitCombination set
# to false. The verify script builds the same join from the stored rankaaba and
# rankaacd columns, which it also places in the first position.
# The script under test orders the final result; the verify script orders its
# two inputs before the join instead.
| 'num' => 5, |
| 'execonly' => 'mapred,tez', |
| 'pig' => q\ |
| SET default_parallel 9; |
| SET pig.splitCombination false; |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| B = foreach A generate a,b,c; |
| C = rank B by a ASC,b ASC DENSE; |
| D = rank B by a ASC,c DESC DENSE; |
| F = join C by $0, D by $0; |
| G = foreach F generate C::rank_B, D::rank_B, C::a, C::b, C::c; |
| H = order G by a ASC, b ASC, c DESC; |
| store H into ':OUTPATH:'; |
| \, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/prerank' using PigStorage(',') as (rownumber:long,rankcabd:long,rankbdaa:long,rankbdca:long,rankaacd:long,rankaaba:long,a:int,b:int,c:int,tail:bytearray); |
| C = foreach A generate rankaaba,a,b,c; |
| E = order C by a ASC,b ASC; |
| D = foreach A generate rankaacd,a,b,c; |
| F = order D by a ASC,c DESC; |
| G = join E by $0, F by $0; |
| H = foreach G generate E::rankaaba, F::rankaacd, E::a, E::b, E::c; |
| store H into ':OUTPATH:'; |
| \, |
| } |
| ] |
| }, |
| { |
# Test group HiveUDF: each test defines a Pig alias through HiveUDF, HiveUDTF or
# HiveUDAF and, in tests 1 to 6, compares the output with a verify script that
# computes the same thing without the Hive wrapper.
# None of the tests in this group sets execonly.
# NOTE(review): verify_pig_script and expected_err_regex are consumed by the
# test harness, which is not visible here - confirm their exact semantics there.
| 'name' => 'HiveUDF', |
| 'tests' => [ |
| { |
| # HiveUDF extends UDF |
# Applies the alias sin, defined as HiveUDF(sin), to the gpa column of
# studenttab10k; the verify script applies SIN to the same column.
| 'num' => 1, |
| 'pig' => q\ |
| define sin HiveUDF('sin'); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = foreach A generate sin(gpa); |
| store B into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = foreach A generate SIN(gpa); |
| store B into ':OUTPATH:';\, |
| }, |
| { |
| # HiveUDF extends GenericUDF |
# Applies the alias upper, defined as HiveUDF(upper), to the name column; the
# verify script applies UPPER to the same column.
| 'num' => 2, |
| 'pig' => q\ |
| define upper HiveUDF('upper'); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = foreach A generate upper(name); |
| store B into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = foreach A generate UPPER(name); |
| store B into ':OUTPATH:';\, |
| }, |
| { |
| # HiveUDTF |
# Loads all three columns as chararray, packs them into a bag with TOBAG and
# flattens the result of explode, defined as HiveUDTF(explode), over that bag.
# The verify script flattens the bag directly, so both are expected to agree.
| 'num' => 3, |
| 'pig' => q\ |
| define explode HiveUDTF('explode'); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:chararray, gpa:chararray); |
| B = foreach A generate TOBAG(name, age, gpa) as b; |
| C = foreach B generate flatten(explode(b)); |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:chararray, gpa:chararray); |
| B = foreach A generate TOBAG(name, age, gpa) as b; |
| C = foreach B generate flatten(b); |
| store C into ':OUTPATH:';\, |
| }, |
| { |
| # HiveUDAF extends GenericUDAF, with null handling |
# Groups studentnulltab10k by name and applies avg, defined as HiveUDAF(avg), to
# A.age; the verify script applies AVG to the same grouped column.
# This is the only test in the group that reads studentnulltab10k.
| 'num' => 4, |
| 'pig' => q\ |
| define avg HiveUDAF('avg'); |
| A = LOAD ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| B = group A by name; |
| C = foreach B generate group, avg(A.age); |
| store C into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| A = LOAD ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:double); |
| B = group A by name; |
| C = foreach B generate group, AVG(A.age); |
| store C into ':OUTPATH:';\, |
| }, |
| { |
| # HiveUDAF extends UDAF |
# Calls percentile, defined as HiveUDAF(percentile), on (age, perc) per name,
# where perc is the constant 0.5 added to every row.
# The verify script registers datafu.jar from :FUNCPATH: and uses
# datafu.pig.stats.Quantile constructed with 0.5 over the ages of each group
# sorted by age, flattening its result.
| 'num' => 5, |
| 'pig' => q\ |
| define percentile HiveUDAF('percentile'); |
| A = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:long, gpa:double); |
| B = foreach A generate name, age, 0.5 as perc; |
| C = group B by name; |
| D = foreach C generate group, percentile(B.(age, perc)); |
| store D into ':OUTPATH:';\, |
| 'verify_pig_script' => q\ |
| register :FUNCPATH:/datafu.jar |
| define Quartile datafu.pig.stats.Quantile('0.5'); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:long, gpa:double); |
| B = group A by name; |
| C = foreach B { |
| sorted = order A by age; |
| generate group, flatten(Quartile(sorted.age)); |
| } |
| store C into ':OUTPATH:';\, |
| }, |
| { |
| # Constant folding and ship jars |
# Both scripts first write a two-line names.txt with a shell echo.
# The script under test defines in_file through HiveUDF with a second
# constructor argument naming names.txt and calls it on the name column.
# The verify script registers :PIGGYBANKJAR:, uploads names.txt next to the
# input data, looks each name up with the piggybank LookupInFiles function,
# casts the result to boolean, and removes the uploaded file at the end.
# These two scripts use a different quote delimiter from the other tests in this
# group; their text contains a backslash, which is the delimiter used elsewhere.
| 'num' => 6, |
| 'pig' => q# |
| sh echo -e "zach young\nzach zipper" > names.txt |
| define in_file HiveUDF('in_file', '(null, "names.txt")'); |
| A = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:long, gpa:double); |
| B = foreach A generate in_file(name, 'names.txt'); |
| store B into ':OUTPATH:';#, |
| 'verify_pig_script' => q#register :PIGGYBANKJAR: |
| sh echo -e "zach young\nzach zipper" > names.txt |
| rmf :INPATH:/singlefile/names.txt |
| fs -put names.txt :INPATH:/singlefile/names.txt |
| define LookupInFiles org.apache.pig.piggybank.evaluation.string.LookupInFiles(); |
| A = LOAD ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:long, gpa:double); |
| B = foreach A generate LookupInFiles(name, ':INPATH:/singlefile/names.txt'); |
| C = foreach B generate (boolean)$0; |
| store C into ':OUTPATH:'; |
| fs -rm :INPATH:/singlefile/names.txt# |
| }, |
| { |
| # Custom Hive UDF and MapredContext |
# Registers testudf.jar from :FUNCPATH:, sets mapred.max.split.size, and wraps
# org.apache.pig.test.udf.evalfunc.DummyContextUDF with HiveUDF, applied to age.
# There is no verify_pig_script here; the only check visible in this entry is
# expected_err_regex, which looks for the UDF_WARNING_1 warning reported 4610
# times.
# NOTE(review): presumably the regex is matched against the error output of the
# job, and the count depends on the split size set above - confirm in harness.
| 'num' => 7, |
| 'pig' => q\set mapred.max.split.size '100000000' |
| register :FUNCPATH:/testudf.jar; |
| define DummyContextUDF HiveUDF('org.apache.pig.test.udf.evalfunc.DummyContextUDF'); |
| A = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double); |
| B = foreach A generate DummyContextUDF(age); |
| store B into ':OUTPATH:';\, |
| 'expected_err_regex' => "Encountered Warning UDF_WARNING_1 4610 time.*", |
| } |
| ] |
| } |
| ], |
| }, |
| ; |
| |
| |
| |