blob: a023ea7d1175764e9975cef02eb2fe64b426109b [file] [log] [blame]
#!/usr/bin/env perl
############################################################################
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# Test definitions for the 'Pig' driver covering OrcStorage.
#
# The structure is a hash with a few top-level settings and a list of test
# groups; each group has a name and a list of tests.  Every test carries a
# 'pig' script and a 'verify_pig_script'.  In each pair the 'pig' script
# round-trips data through OrcStorage while the verify script computes the
# same relation without OrcStorage.
# NOTE(review): presumably the harness runs both scripts and compares their
# outputs ('verify_with_pig' => 1) -- confirm against the harness code.
#
# $cfg is assigned as a package variable (no 'my') exactly as before; the
# code that loads this file is not visible here, so that is left untouched.
$cfg = {
    driver             => 'Pig',
    nummachines        => 5,
    verify_with_pig    => 1,
    verify_pig_version => 'old',
    groups             => [
        {
            name  => 'Orc',
            tests => [
                # Test 1: Load primitive columns from a plain-text input and
                # store them into OrcStorage, then read them back and filter.
                # Also exercises multiple load/store statements in one script.
                {
                    num => 1,
                    notmq => 1,
                    pig => q\
a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
store a into ':OUTPATH:.intermediate' using OrcStorage();
exec
b = load ':OUTPATH:.intermediate' using OrcStorage();
c = filter b by age < 30;
store c into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
b = filter a by age < 30;
store b into ':OUTPATH:';\,
                },
                # Test 2: Load complex columns (map, tuple, bag) from a
                # plain-text input and round-trip them through OrcStorage.
                # Restricted to mapred and tez; Test 6 is the spark variant.
                {
                    num => 2,
                    notmq => 1,
                    execonly => 'mapred,tez',
                    pig => q\
a = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)});
store a into ':OUTPATH:.intermediate' using OrcStorage();
exec
b = load ':OUTPATH:.intermediate' using OrcStorage();
store b into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)});
store a into ':OUTPATH:';\,
                },
                # Test 3: Join, group and count across two datasets that were
                # both written with OrcStorage.  The map lookups yield
                # untyped values, so this also covers bytearray handling.
                {
                    num => 3,
                    notmq => 1,
                    pig => q\
a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
store a into ':OUTPATH:.simple.intermediate' using OrcStorage();
exec
b = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)}, nameagegpamap_name:chararray, nameagegpamap_age:int, nameagegpamap_gpa:float);
store b into ':OUTPATH:.complex.intermediate' using OrcStorage();
exec
c = load ':OUTPATH:.simple.intermediate' using OrcStorage();
d = load ':OUTPATH:.complex.intermediate' using OrcStorage();
e = foreach c generate name, age, gpa;
f = foreach d generate nameagegpamap#'name' as name, nameagegpamap#'age' as age, nameagegpamap#'gpa' as gpa, nameagegpatuple.tage as tage, FLATTEN(nameagegpabag) as (bname, bage, bgpa);
g = join e by name, f by name;
h = group g by (f::bgpa);
j = foreach h generate group, COUNT(g) as students;
k = order j by group, students;
store k into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age:int, gpa:float);
b = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)}, nameagegpamap_name:chararray, nameagegpamap_age:int, nameagegpamap_gpa:float);
c = foreach a generate name, age, gpa;
d = foreach b generate nameagegpamap#'name' as name, nameagegpamap#'age' as age, nameagegpamap#'gpa' as gpa, nameagegpatuple.tage as tage, FLATTEN(nameagegpabag) as (bname, bage, bgpa);
e = join c by name, d by name;
f = group e by (d::bgpa);
g = foreach f generate group, COUNT(e) as students;
h = order g by group, students;
store h into ':OUTPATH:';\,
                },
                # Test 4: Pass an option string to OrcStorage when storing,
                # then read the data back with a default OrcStorage().
                {
                    num => 4,
                    notmq => 1,
                    pig => q\
a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
store a into ':OUTPATH:.orc_params.intermediate' using OrcStorage('-c ZLIB -s 67108864 -r 100000 -b 1048576 -p true -v 0.12');
exec
b = load ':OUTPATH:.orc_params.intermediate' using OrcStorage();
store b into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
store a into ':OUTPATH:';\,
                },
                # Test 5: Load a map whose key may be null.  The verify
                # script expects an empty map wherever name is null.
                {
                    num => 5,
                    notmq => 1,
                    pig => q\
a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age);
b = foreach a generate TOMAP(name, age) as m;
store b into ':OUTPATH:.intermediate' using OrcStorage();
exec
c = load ':OUTPATH:.intermediate' using OrcStorage();
store c into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studentnulltab10k' as (name:chararray, age);
b = foreach a generate (name is null ? [] : TOMAP(name, age));
store b into ':OUTPATH:';\,
                },
                # Test 6: Spark-only replacement for Test 2.  Spark and MR
                # may emit the entries of a Pig map in different orders; that
                # is harmless, but it makes the output comparison report a
                # false failure.  This variant therefore projects the
                # individual map values instead of storing the map itself.
                {
                    num => 6,
                    notmq => 1,
                    execonly => 'spark',
                    pig => q\
a = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)});
store a into ':OUTPATH:.intermediate' using OrcStorage();
exec
b = load ':OUTPATH:.intermediate' using OrcStorage();
c = foreach b generate nameagegpamap#'name', nameagegpamap#'age', nameagegpamap#'gpa', nameagegpatuple, nameagegpabag;
store c into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studentcomplextab10k' as (nameagegpamap:map[], nameagegpatuple:tuple(tname:chararray, tage:int, tgpa:float), nameagegpabag:bag{t:tuple(bname:chararray, bage:int, bgpa:float)});
b = foreach a generate nameagegpamap#'name', nameagegpamap#'age', nameagegpamap#'gpa', nameagegpatuple, nameagegpabag;
store b into ':OUTPATH:';\,
                },
            ],
        },
        # Every test in this group sorts the input on one column with
        # "parallel 4", stores it through OrcStorage, reloads it and applies
        # a filter.
        # NOTE(review): the group name suggests the filters are meant to
        # exercise predicate pushdown into the ORC reader -- confirm.
        {
            name  => 'Orc_Pushdown',
            tests => [
                # Test 1: Sort by name, store into OrcStorage, then reload
                # and filter on a chararray column (name < 'david falkner').
                {
                    num => 1,
                    notmq => 1,
                    pig => q\
a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
b = order a by name parallel 4;
store b into ':OUTPATH:.intermediate' using OrcStorage();
exec
b = load ':OUTPATH:.intermediate' using OrcStorage();
c = filter b by name < 'david falkner';
store c into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
b = filter a by name < 'david falkner';
store b into ':OUTPATH:';\,
                },
                # Test 2: Sort the studenttab20m input by age (descending),
                # store it with OrcStorage('-s 10000000'), then reload and
                # filter on an int column (age <= 22).
                {
                    num => 2,
                    notmq => 1,
                    execonly => 'mapred,tez,spark', # studenttab20m not available in local mode
                    pig => q\
a = load ':INPATH:/singlefile/studenttab20m' as (name:chararray, age:int, gpa:float);
b = order a by age desc parallel 4;
store b into ':OUTPATH:.intermediate' using OrcStorage('-s 10000000');
exec
b = load ':OUTPATH:.intermediate' using OrcStorage();
c = filter b by age <= 22;
store c into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studenttab20m' as (name:chararray, age:int, gpa:float);
b = filter a by age <= 22;
store b into ':OUTPATH:';\,
                },
                # Test 3: Sort by gpa, then reload and filter with a compound
                # predicate over a float and an int column, including a
                # constant expression (30 + 2).
                {
                    num => 3,
                    notmq => 1,
                    pig => q\
a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
b = order a by gpa parallel 4;
store b into ':OUTPATH:.intermediate' using OrcStorage();
exec
b = load ':OUTPATH:.intermediate' using OrcStorage();
c = filter b by gpa >= 3.2 and gpa < 3.5 and age > 30 + 2;
store c into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:float);
b = filter a by gpa >= 3.2 and gpa < 3.5 and age > 30 + 2;
store b into ':OUTPATH:';\,
                },
                # Test 4: Same pattern with gpa declared as bigdecimal;
                # the filter is gpa >= 3.5.
                {
                    num => 4,
                    notmq => 1,
                    pig => q\
a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:bigdecimal);
b = order a by gpa parallel 4;
store b into ':OUTPATH:.intermediate' using OrcStorage();
exec
b = load ':OUTPATH:.intermediate' using OrcStorage();
c = filter b by gpa >= 3.5;
store c into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:bigdecimal);
b = filter a by gpa >= 3.5;
store b into ':OUTPATH:';\,
                    floatpostprocess => 1,
                    delimiter => ' ',
                },
                # Test 5: Filter on a datetime column.  The script derives d
                # from age (ToDate '20100101' when age > 35, otherwise
                # '20100105'), sorts by d, and after the reload keeps rows
                # with d >= ToDate '20100103'.  Only the '20100105' rows
                # survive, which is why the verify script filters on
                # age <= 35.
                # NOTE(review): the pig script declares gpa as double while
                # the verify script declares it as bigdecimal; presumably
                # 'floatpostprocess' reconciles the output -- confirm.
                {
                    num => 5,
                    notmq => 1,
                    pig => q\
a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:double);
b = foreach a generate name, age, gpa, (age>35 ? ToDate('20100101', 'yyyyMMdd', 'UTC') : ToDate('20100105', 'yyyyMMdd', 'UTC')) as d;
c = order b by d parallel 4;
store c into ':OUTPATH:.intermediate' using OrcStorage();
exec
b = load ':OUTPATH:.intermediate' using OrcStorage();
c = filter b by d >= ToDate('20100103', 'yyyyMMdd', 'UTC');
d = foreach c generate name, age, gpa;
store d into ':OUTPATH:';\,
                    verify_pig_script => q\a = load ':INPATH:/singlefile/studenttab10k' as (name:chararray, age:int, gpa:bigdecimal);
b = filter a by age<=35;
store b into ':OUTPATH:';\,
                    floatpostprocess => 1,
                    delimiter => ' ',
                },
            ],
        },
    ],
};