ql/src/test/queries/clientpositive/orc_split_elimination.q - hive - Git at Google

 set hive.vectorized.execution.enabled=false;

 -- SORT_QUERY_RESULTS

 create table orc_split_elim (userid bigint, string1 string, subtype double, decimal1 decimal(38,0), ts timestamp) stored as orc;

 load data local inpath '../../data/files/orc_split_elim.orc' into table orc_split_elim;

 SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
 SET mapred.min.split.size=1000;
 SET mapred.max.split.size=5000;
 SET hive.optimize.index.filter=false;

 -- The above table will have 5 splits with the followings stats
 --  Stripe 1:
 --    Column 0: count: 5000
 --    Column 1: count: 5000 min: 2 max: 100 sum: 499902
 --    Column 2: count: 5000 min: foo max: zebra sum: 24998
 --    Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
 --    Column 4: count: 5000 min: 0 max: 1.2 sum: 1.2
 --    Column 5: count: 5000
 --  Stripe 2:
 --    Column 0: count: 5000
 --    Column 1: count: 5000 min: 13 max: 100 sum: 499913
 --    Column 2: count: 5000 min: bar max: zebra sum: 24998
 --    Column 3: count: 5000 min: 8.0 max: 80.0 sum: 40072.0
 --    Column 4: count: 5000 min: 0 max: 2.2 sum: 2.2
 --    Column 5: count: 5000
 --  Stripe 3:
 --    Column 0: count: 5000
 --    Column 1: count: 5000 min: 29 max: 100 sum: 499929
 --    Column 2: count: 5000 min: cat max: zebra sum: 24998
 --    Column 3: count: 5000 min: 8.0 max: 8.0 sum: 40000.0
 --    Column 4: count: 5000 min: 0 max: 3.3 sum: 3.3
 --    Column 5: count: 5000
 --  Stripe 4:
 --    Column 0: count: 5000
 --    Column 1: count: 5000 min: 70 max: 100 sum: 499970
 --    Column 2: count: 5000 min: dog max: zebra sum: 24998
 --    Column 3: count: 5000 min: 1.8 max: 8.0 sum: 39993.8
 --    Column 4: count: 5000 min: 0 max: 4.4 sum: 4.4
 --    Column 5: count: 5000
 --  Stripe 5:
 --    Column 0: count: 5000
 --    Column 1: count: 5000 min: 5 max: 100 sum: 499905
 --    Column 2: count: 5000 min: eat max: zebra sum: 24998
 --    Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
 --    Column 4: count: 5000 min: 0 max: 5.5 sum: 5.5
 --    Column 5: count: 5000

 -- 5 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0;

 SET hive.optimize.index.filter=true;
 -- 0 mapper
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0;
 SET hive.optimize.index.filter=false;

 -- 5 mappers. count should be 0
 select count(*) from orc_split_elim where userid<=0;

 SET hive.optimize.index.filter=true;
 -- 0 mapper
 select count(*) from orc_split_elim where userid<=0;
 SET hive.optimize.index.filter=false;

 -- 5 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2;

 SET hive.optimize.index.filter=true;
 -- 1 mapper
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2;
 SET hive.optimize.index.filter=false;

 -- 5 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5;

 SET hive.optimize.index.filter=true;
 -- 2 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5;
 SET hive.optimize.index.filter=false;

 -- 5 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13;

 SET hive.optimize.index.filter=true;
 -- 3 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13;
 SET hive.optimize.index.filter=false;

 -- 5 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29;

 SET hive.optimize.index.filter=true;
 -- 4 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29;
 SET hive.optimize.index.filter=false;

 -- 5 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70;

 SET hive.optimize.index.filter=true;
 -- 5 mappers
 select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70;
 SET hive.optimize.index.filter=false;

 -- partitioned table
 create table orc_split_elim_part (userid bigint, string1 string, subtype double, decimal1 decimal(38,0), ts timestamp) partitioned by (country string, year int) stored as orc;

 alter table orc_split_elim_part add partition(country='us', year=2000);
 alter table orc_split_elim_part add partition(country='us', year=2001);

 load data local inpath '../../data/files/orc_split_elim.orc' into table orc_split_elim_part partition(country='us', year=2000);
 load data local inpath '../../data/files/orc_split_elim.orc' into table orc_split_elim_part partition(country='us', year=2001);

 -- 10 mapper - no split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us';

 SET hive.optimize.index.filter=true;
 -- 2 mapper - split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us';
 SET hive.optimize.index.filter=false;

 -- 10 mapper - no split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us' and (year=2000 or year=2001);

 SET hive.optimize.index.filter=true;
 -- 2 mapper - split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us' and (year=2000 or year=2001);
 SET hive.optimize.index.filter=false;

 -- 10 mapper - no split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us' and year=2000;

 SET hive.optimize.index.filter=true;
 -- 1 mapper - split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us' and year=2000;
 SET hive.optimize.index.filter=false;

 -- 10 mapper - no split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us';

 SET hive.optimize.index.filter=true;
 -- 4 mapper - split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us';
 SET hive.optimize.index.filter=false;

 -- 10 mapper - no split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us' and (year=2000 or year=2001);

 SET hive.optimize.index.filter=true;
 -- 4 mapper - split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us' and (year=2000 or year=2001);
 SET hive.optimize.index.filter=false;

 -- 10 mapper - no split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us' and year=2000;

 SET hive.optimize.index.filter=true;
 -- 2 mapper - split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us' and year=2000;
 SET hive.optimize.index.filter=false;

 -- 0 mapper - no split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=70 and country='in';
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=70 and country='us' and year=2002;

 SET hive.optimize.index.filter=true;
 -- 0 mapper - split elimination
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=70 and country='in';
 select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=70 and country='us' and year=2002;
 SET hive.optimize.index.filter=false;
	set hive.vectorized.execution.enabled=false;

	-- SORT_QUERY_RESULTS

	create table orc_split_elim (userid bigint, string1 string, subtype double, decimal1 decimal(38,0), ts timestamp) stored as orc;

	load data local inpath '../../data/files/orc_split_elim.orc' into table orc_split_elim;

	SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
	SET mapred.min.split.size=1000;
	SET mapred.max.split.size=5000;
	SET hive.optimize.index.filter=false;

	-- The above table will have 5 splits with the followings stats
	-- Stripe 1:
	-- Column 0: count: 5000
	-- Column 1: count: 5000 min: 2 max: 100 sum: 499902
	-- Column 2: count: 5000 min: foo max: zebra sum: 24998
	-- Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
	-- Column 4: count: 5000 min: 0 max: 1.2 sum: 1.2
	-- Column 5: count: 5000
	-- Stripe 2:
	-- Column 0: count: 5000
	-- Column 1: count: 5000 min: 13 max: 100 sum: 499913
	-- Column 2: count: 5000 min: bar max: zebra sum: 24998
	-- Column 3: count: 5000 min: 8.0 max: 80.0 sum: 40072.0
	-- Column 4: count: 5000 min: 0 max: 2.2 sum: 2.2
	-- Column 5: count: 5000
	-- Stripe 3:
	-- Column 0: count: 5000
	-- Column 1: count: 5000 min: 29 max: 100 sum: 499929
	-- Column 2: count: 5000 min: cat max: zebra sum: 24998
	-- Column 3: count: 5000 min: 8.0 max: 8.0 sum: 40000.0
	-- Column 4: count: 5000 min: 0 max: 3.3 sum: 3.3
	-- Column 5: count: 5000
	-- Stripe 4:
	-- Column 0: count: 5000
	-- Column 1: count: 5000 min: 70 max: 100 sum: 499970
	-- Column 2: count: 5000 min: dog max: zebra sum: 24998
	-- Column 3: count: 5000 min: 1.8 max: 8.0 sum: 39993.8
	-- Column 4: count: 5000 min: 0 max: 4.4 sum: 4.4
	-- Column 5: count: 5000
	-- Stripe 5:
	-- Column 0: count: 5000
	-- Column 1: count: 5000 min: 5 max: 100 sum: 499905
	-- Column 2: count: 5000 min: eat max: zebra sum: 24998
	-- Column 3: count: 5000 min: 0.8 max: 8.0 sum: 39992.8
	-- Column 4: count: 5000 min: 0 max: 5.5 sum: 5.5
	-- Column 5: count: 5000

	-- 5 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0;

	SET hive.optimize.index.filter=true;
	-- 0 mapper
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=0;
	SET hive.optimize.index.filter=false;

	-- 5 mappers. count should be 0
	select count(*) from orc_split_elim where userid<=0;

	SET hive.optimize.index.filter=true;
	-- 0 mapper
	select count(*) from orc_split_elim where userid<=0;
	SET hive.optimize.index.filter=false;

	-- 5 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2;

	SET hive.optimize.index.filter=true;
	-- 1 mapper
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=2;
	SET hive.optimize.index.filter=false;

	-- 5 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5;

	SET hive.optimize.index.filter=true;
	-- 2 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=5;
	SET hive.optimize.index.filter=false;

	-- 5 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13;

	SET hive.optimize.index.filter=true;
	-- 3 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=13;
	SET hive.optimize.index.filter=false;

	-- 5 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29;

	SET hive.optimize.index.filter=true;
	-- 4 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=29;
	SET hive.optimize.index.filter=false;

	-- 5 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70;

	SET hive.optimize.index.filter=true;
	-- 5 mappers
	select userid,string1,subtype,decimal1,ts from orc_split_elim where userid<=70;
	SET hive.optimize.index.filter=false;

	-- partitioned table
	create table orc_split_elim_part (userid bigint, string1 string, subtype double, decimal1 decimal(38,0), ts timestamp) partitioned by (country string, year int) stored as orc;

	alter table orc_split_elim_part add partition(country='us', year=2000);
	alter table orc_split_elim_part add partition(country='us', year=2001);

	load data local inpath '../../data/files/orc_split_elim.orc' into table orc_split_elim_part partition(country='us', year=2000);
	load data local inpath '../../data/files/orc_split_elim.orc' into table orc_split_elim_part partition(country='us', year=2001);

	-- 10 mapper - no split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us';

	SET hive.optimize.index.filter=true;
	-- 2 mapper - split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us';
	SET hive.optimize.index.filter=false;

	-- 10 mapper - no split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us' and (year=2000 or year=2001);

	SET hive.optimize.index.filter=true;
	-- 2 mapper - split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us' and (year=2000 or year=2001);
	SET hive.optimize.index.filter=false;

	-- 10 mapper - no split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us' and year=2000;

	SET hive.optimize.index.filter=true;
	-- 1 mapper - split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=2 and country='us' and year=2000;
	SET hive.optimize.index.filter=false;

	-- 10 mapper - no split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us';

	SET hive.optimize.index.filter=true;
	-- 4 mapper - split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us';
	SET hive.optimize.index.filter=false;

	-- 10 mapper - no split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us' and (year=2000 or year=2001);

	SET hive.optimize.index.filter=true;
	-- 4 mapper - split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us' and (year=2000 or year=2001);
	SET hive.optimize.index.filter=false;

	-- 10 mapper - no split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us' and year=2000;

	SET hive.optimize.index.filter=true;
	-- 2 mapper - split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=5 and country='us' and year=2000;
	SET hive.optimize.index.filter=false;

	-- 0 mapper - no split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=70 and country='in';
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=70 and country='us' and year=2002;

	SET hive.optimize.index.filter=true;
	-- 0 mapper - split elimination
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=70 and country='in';
	select userid,string1,subtype,decimal1,ts from orc_split_elim_part where userid<=70 and country='us' and year=2002;
	SET hive.optimize.index.filter=false;