blob: f5a5824644896b544c7f1e8d908be067174070e7 [file] [log] [blame]
--! qt:dataset:srcpart
--! qt:dataset:alltypesorc
set hive.compute.query.using.stats=false;
set hive.mapred.mode=nonstrict;
set hive.explain.user=false;
set hive.optimize.ppd=true;
set hive.ppd.remove.duplicatefilters=true;
set hive.tez.dynamic.partition.pruning=true;
set hive.tez.dynamic.semijoin.reduction=true;
set hive.optimize.metadataonly=false;
set hive.optimize.index.filter=true;
set hive.stats.autogather=true;
set hive.tez.bigtable.minsize.semijoin.reduction=1;
set hive.tez.min.bloom.filter.entries=1;
set hive.stats.fetch.column.stats=true;
set hive.tez.bloom.filter.factor=1.0f;
-- Create Tables
create table alltypesorc_int_n1 ( cint int, cstring string ) stored as ORC;
create table srcpart_date_n7 (key string, value string) partitioned by (ds string ) stored as ORC;
CREATE TABLE srcpart_small_n3(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC;
-- Add Partitions
alter table srcpart_date_n7 add partition (ds = "2008-04-08");
alter table srcpart_date_n7 add partition (ds = "2008-04-09");
alter table srcpart_small_n3 add partition (ds = "2008-04-08");
alter table srcpart_small_n3 add partition (ds = "2008-04-09");
-- Load
insert overwrite table alltypesorc_int_n1 select cint, cstring1 from alltypesorc;
insert overwrite table srcpart_date_n7 partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08";
insert overwrite table srcpart_date_n7 partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09";
insert overwrite table srcpart_small_n3 partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" limit 20;
set hive.tez.dynamic.semijoin.reduction=false;
analyze table alltypesorc_int_n1 compute statistics for columns;
analyze table srcpart_date_n7 compute statistics for columns;
analyze table srcpart_small_n3 compute statistics for columns;
-- single column, single key
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
set hive.tez.dynamic.semijoin.reduction=true;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
set hive.tez.dynamic.semijoin.reduction=true;
-- Mix dynamic partition pruning(DPP) and min/max bloom filter optimizations. Should pick the DPP.
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.ds);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.ds);
set hive.tez.dynamic.semijoin.reduction=false;
--multiple sources, single key
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_small_n3.key1 = alltypesorc_int_n1.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_small_n3.key1 = alltypesorc_int_n1.cstring);
set hive.tez.dynamic.semijoin.reduction=true;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_small_n3.key1 = alltypesorc_int_n1.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_small_n3.key1 = alltypesorc_int_n1.cstring);
set hive.tez.dynamic.semijoin.reduction=false;
-- single source, multiple keys
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1 and srcpart_date_n7.value = srcpart_small_n3.value1);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1 and srcpart_date_n7.value = srcpart_small_n3.value1);
set hive.tez.dynamic.semijoin.reduction=true;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1 and srcpart_date_n7.value = srcpart_small_n3.value1);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1 and srcpart_date_n7.value = srcpart_small_n3.value1);
set hive.tez.dynamic.semijoin.reduction=false;
-- multiple sources, different keys
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
set hive.tez.dynamic.semijoin.reduction=true;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
-- Explain extended to verify fast start for Reducer in semijoin branch
EXPLAIN extended select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
set hive.tez.dynamic.semijoin.reduction=false;
-- With Mapjoins, there shouldn't be any semijoin parallel to mapjoin.
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask=true;
set hive.auto.convert.join.noconditionaltask.size=100000000000;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
set hive.tez.dynamic.semijoin.reduction=true;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
set hive.tez.dynamic.semijoin.reduction.for.mapjoin=true;
-- Enable semijoin parallel to mapjoins.
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1);
set hive.tez.dynamic.semijoin.reduction.for.mapjoin=false;
set hive.tez.dynamic.semijoin.reduction=false;
-- multiple sources, different keys
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
set hive.tez.dynamic.semijoin.reduction=true;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
set hive.tez.dynamic.semijoin.reduction.for.mapjoin=true;
-- Enable semijoin parallel to mapjoins.
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1) join alltypesorc_int_n1 on (srcpart_date_n7.value = alltypesorc_int_n1.cstring);
set hive.tez.dynamic.semijoin.reduction.for.mapjoin=false;
-- HIVE-17323 - with DPP, the 1st mapjoin is on a map with DPP and 2nd mapjoin is on a map which had semijoin but still removed.
create table alltypesorc_int40 as select * from alltypesorc_int_n1 limit 40;
set hive.tez.dynamic.semijoin.reduction=false;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.ds = srcpart_small_n3.ds) join alltypesorc_int40 on (srcpart_date_n7.value = alltypesorc_int40.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.ds = srcpart_small_n3.ds) join alltypesorc_int40 on (srcpart_date_n7.value = alltypesorc_int40.cstring);
set hive.tez.dynamic.semijoin.reduction=true;
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.ds = srcpart_small_n3.ds) join alltypesorc_int40 on (srcpart_date_n7.value = alltypesorc_int40.cstring);
select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.ds = srcpart_small_n3.ds) join alltypesorc_int40 on (srcpart_date_n7.value = alltypesorc_int40.cstring);
-- HIVE-17399
create table srcpart_small10 as select * from srcpart_small_n3 limit 10;
analyze table srcpart_small10 compute statistics for columns;
set hive.tez.dynamic.semijoin.reduction=false;
EXPLAIN select count(*) from srcpart_small10, srcpart_small_n3, srcpart_date_n7 where srcpart_small_n3.key1 = srcpart_small10.key1 and srcpart_date_n7.ds = srcpart_small_n3.ds;
select count(*) from srcpart_small10, srcpart_small_n3, srcpart_date_n7 where srcpart_small_n3.key1 = srcpart_small10.key1 and srcpart_date_n7.ds = srcpart_small_n3.ds;
set hive.tez.dynamic.semijoin.reduction=true;
set hive.llap.object.cache.enabled=false;
EXPLAIN select count(*) from srcpart_small10, srcpart_small_n3, srcpart_date_n7 where srcpart_small_n3.key1 = srcpart_small10.key1 and srcpart_date_n7.ds = srcpart_small_n3.ds;
select count(*) from srcpart_small10, srcpart_small_n3, srcpart_date_n7 where srcpart_small_n3.key1 = srcpart_small10.key1 and srcpart_date_n7.ds = srcpart_small_n3.ds;
-- HIVE-17936
set hive.tez.dynamic.semijoin.reduction.for.dpp.factor = 0.75;
EXPLAIN select count(*) from srcpart_small10, srcpart_small_n3, srcpart_date_n7 where srcpart_small_n3.key1 = srcpart_small10.key1 and srcpart_date_n7.ds = srcpart_small_n3.ds;
-- semijoin branch should be removed.
set hive.tez.dynamic.semijoin.reduction.for.dpp.factor = 0.4;
EXPLAIN select count(*) from srcpart_small10, srcpart_small_n3, srcpart_date_n7 where srcpart_small_n3.key1 = srcpart_small10.key1 and srcpart_date_n7.ds = srcpart_small_n3.ds;
-- With unions
explain select * from alltypesorc_int_n1 join
(select srcpart_date_n7.key as key from srcpart_date_n7
union all
select srcpart_small_n3.key1 as key from srcpart_small_n3) unionsrc on (alltypesorc_int_n1.cstring = unionsrc.key);
-- Non equi joins : Should NOT create a semijoin branch
EXPLAIN select count(*) from srcpart_date_n7 join srcpart_small_n3 on (srcpart_date_n7.key = srcpart_small_n3.key1 and srcpart_date_n7.value > srcpart_small_n3.value1);
drop table srcpart_date_n7;
drop table srcpart_small_n3;
drop table alltypesorc_int_n1;