--! qt:dataset:src
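-- Exercises bucket map join (BMJ) conversion on Tez: each scenario runs the
-- same EXPLAIN with hive.convert.join.bucket.mapjoin.tez=false and then =true
-- so the resulting plans can be compared.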
set hive.stats.filter.range.uniform=false;
set hive.stats.column.autogather=false;
set hive.strict.checks.bucketing=false;
set hive.mapred.mode=nonstrict;
set hive.explain.user=false;
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask=true;
set hive.auto.convert.join.noconditionaltask.size=30000;
set hive.llap.memory.oversubscription.max.executors.per.query=3;
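-- hive.auto.convert.join.noconditionaltask.size caps the small-table size for
-- map join conversion; the scenarios below adjust it to control whether the
-- conversion fires.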
CREATE TABLE srcbucket_mapjoin_n18(key int, value string) partitioned by (ds string) CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
CREATE TABLE tab_part_n11 (key int, value string) PARTITIONED BY(ds STRING) CLUSTERED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
CREATE TABLE srcbucket_mapjoin_part_n20 (key int, value string) partitioned by (ds string) CLUSTERED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
load data local inpath '../../data/files/bmj/000000_0' INTO TABLE srcbucket_mapjoin_n18 partition(ds='2008-04-08');
load data local inpath '../../data/files/bmj1/000001_0' INTO TABLE srcbucket_mapjoin_n18 partition(ds='2008-04-08');
load data local inpath '../../data/files/bmj/000000_0' INTO TABLE srcbucket_mapjoin_part_n20 partition(ds='2008-04-08');
load data local inpath '../../data/files/bmj/000001_0' INTO TABLE srcbucket_mapjoin_part_n20 partition(ds='2008-04-08');
load data local inpath '../../data/files/bmj/000002_0' INTO TABLE srcbucket_mapjoin_part_n20 partition(ds='2008-04-08');
load data local inpath '../../data/files/bmj/000003_0' INTO TABLE srcbucket_mapjoin_part_n20 partition(ds='2008-04-08');
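-- One file per bucket is loaded so the file layout matches the declared bucket
-- counts (2 for srcbucket_mapjoin_n18, 4 for srcbucket_mapjoin_part_n20).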
set hive.optimize.bucketingsorting=false;
insert overwrite table tab_part_n11 partition (ds='2008-04-08')
select key,value from srcbucket_mapjoin_part_n20;
CREATE TABLE tab_n10(key int, value string) PARTITIONED BY(ds STRING) CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
insert overwrite table tab_n10 partition (ds='2008-04-08')
select key,value from srcbucket_mapjoin_n18;
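-- Column statistics feed the size estimates used by map join conversion, so
-- every test table is analyzed before the EXPLAIN scenarios run.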
analyze table srcbucket_mapjoin_n18 compute statistics for columns;
analyze table srcbucket_mapjoin_part_n20 compute statistics for columns;
analyze table tab_n10 compute statistics for columns;
analyze table tab_part_n11 compute statistics for columns;
set hive.auto.convert.join.noconditionaltask.size=3500;
set hive.convert.join.bucket.mapjoin.tez = false;
explain select a.key, b.key from tab_part_n11 a join tab_part_n11 c on a.key = c.key join tab_part_n11 b on a.value = b.value;
set hive.convert.join.bucket.mapjoin.tez = true;
explain select a.key, b.key from tab_part_n11 a join tab_part_n11 c on a.key = c.key join tab_part_n11 b on a.value = b.value;
CREATE TABLE tab1_n5(key int, value string) CLUSTERED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
insert overwrite table tab1_n5
select key,value from srcbucket_mapjoin_n18;
analyze table tab1_n5 compute statistics for columns;
-- Negative test: src is not bucketed, so BMJ conversion must not be applied.
set hive.auto.convert.join.noconditionaltask.size=12000;
set hive.convert.join.bucket.mapjoin.tez = false;
explain
select a.key, a.value, b.value
from tab1_n5 a join src b on a.key = b.key;
set hive.convert.join.bucket.mapjoin.tez = true;
explain
select a.key, a.value, b.value
from tab1_n5 a join src b on a.key = b.key;
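-- Joins of filtered subqueries over the same bucketed table: both sides still
-- project the bucketing key, so conversion should remain possible. The same
-- pattern repeats below with left and right outer joins.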
set hive.auto.convert.join.noconditionaltask.size=2500;
set hive.convert.join.bucket.mapjoin.tez = false;
explain
select a.key, b.key from (select key from tab_part_n11 where key > 1) a join (select key from tab_part_n11 where key > 2) b on a.key = b.key;
set hive.convert.join.bucket.mapjoin.tez = true;
explain
select a.key, b.key from (select key from tab_part_n11 where key > 1) a join (select key from tab_part_n11 where key > 2) b on a.key = b.key;
set hive.convert.join.bucket.mapjoin.tez = false;
explain
select a.key, b.key from (select key from tab_part_n11 where key > 1) a left outer join (select key from tab_part_n11 where key > 2) b on a.key = b.key;
set hive.convert.join.bucket.mapjoin.tez = true;
explain
select a.key, b.key from (select key from tab_part_n11 where key > 1) a left outer join (select key from tab_part_n11 where key > 2) b on a.key = b.key;
set hive.convert.join.bucket.mapjoin.tez = false;
explain
select a.key, b.key from (select key from tab_part_n11 where key > 1) a right outer join (select key from tab_part_n11 where key > 2) b on a.key = b.key;
set hive.convert.join.bucket.mapjoin.tez = true;
explain
select a.key, b.key from (select key from tab_part_n11 where key > 1) a right outer join (select key from tab_part_n11 where key > 2) b on a.key = b.key;
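-- DISTINCT subqueries: the first pair applies DISTINCT to the bucketing key,
-- the second to value, contrasting whether bucketing survives the aggregation.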
set hive.auto.convert.join.noconditionaltask.size=2000;
set hive.convert.join.bucket.mapjoin.tez = false;
explain select a.key, b.key from (select distinct key from tab_n10) a join tab_n10 b on b.key = a.key;
set hive.convert.join.bucket.mapjoin.tez = true;
explain select a.key, b.key from (select distinct key from tab_n10) a join tab_n10 b on b.key = a.key;
set hive.convert.join.bucket.mapjoin.tez = false;
explain select a.value, b.value from (select distinct value from tab_n10) a join tab_n10 b on b.key = a.value;
set hive.convert.join.bucket.mapjoin.tez = true;
explain select a.value, b.value from (select distinct value from tab_n10) a join tab_n10 b on b.key = a.value;
-- Multi-key bucketing: tab_part1 is clustered by (key, value).
CREATE TABLE tab_part1 (key int, value string) PARTITIONED BY(ds STRING) CLUSTERED BY (key, value) INTO 4 BUCKETS STORED AS TEXTFILE;
insert overwrite table tab_part1 partition (ds='2008-04-08')
select key,value from srcbucket_mapjoin_part_n20;
analyze table tab_part1 compute statistics for columns;
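-- The join below matches both bucketing columns (key and value), as multi-key
-- BMJ conversion requires.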
set hive.auto.convert.join.noconditionaltask.size=12000;
set hive.convert.join.bucket.mapjoin.tez = false;
explain
select count(*)
from
(select distinct key,value from tab_part_n11) a join tab_n10 b on a.key = b.key and a.value = b.value;
set hive.convert.join.bucket.mapjoin.tez = true;
explain
select count(*)
from
(select distinct key,value from tab_part_n11) a join tab_n10 b on a.key = b.key and a.value = b.value;
-- HIVE-17939: join an unbucketed small table to a bucketed, partitioned big table.
create table small (i int) stored as ORC;
create table big (i int) partitioned by (k int) clustered by (i) into 10 buckets stored as ORC;
insert into small values (1),(2),(3),(4),(5),(6);
insert into big partition(k=1) values(1),(3),(5),(7),(9);
insert into big partition(k=2) values(0),(2),(4),(6),(8);
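-- With small = {1..6}, big k=1 = {1,3,5,7,9} and k=2 = {0,2,4,6,8}, the join
-- should return 1,3,5 (from k=1) and 2,4,6 (from k=2).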
explain select small.i, big.i from small,big where small.i=big.i;
select small.i, big.i from small,big where small.i=big.i order by small.i, big.i;
-- Bucket map join disabled for external tables
-- Create external table equivalent of tab_part_n11
CREATE EXTERNAL TABLE tab_part_ext (key int, value string) PARTITIONED BY(ds STRING) CLUSTERED BY (key) INTO 4 BUCKETS STORED AS TEXTFILE;
insert overwrite table tab_part_ext partition (ds='2008-04-08')
select key,value from srcbucket_mapjoin_part_n20;
analyze table tab_part_ext compute statistics for columns;
set hive.auto.convert.join.noconditionaltask.size=3500;
set hive.convert.join.bucket.mapjoin.tez = true;
set hive.disable.unsafe.external.table.operations=true;
set test.comment=Bucket map join should work here;
set test.comment;
explain select a.key, b.key from tab_part_n11 a join tab_part_n11 c on a.key = c.key join tab_part_n11 b on a.value = b.value;
set test.comment=External tables, bucket map join should be disabled;
set test.comment;
explain select a.key, b.key from tab_part_ext a join tab_part_ext c on a.key = c.key join tab_part_ext b on a.value = b.value;
-- HIVE-20187: must not create a BMJ here.
create table my_fact(AMT decimal(20,3), bucket_col string, join_col string)
PARTITIONED BY (FISCAL_YEAR string, ACCOUNTING_PERIOD string)
CLUSTERED BY (bucket_col) INTO 10 BUCKETS
stored as ORC;
create table my_dim(join_col string,filter_col string) stored as orc;
INSERT INTO my_dim VALUES("1", "VAL1"), ("2", "VAL2"), ("3", "VAL3"), ("4", "VAL4");
INSERT OVERWRITE TABLE my_fact PARTITION(FISCAL_YEAR="2015", ACCOUNTING_PERIOD="20") VALUES(1.11, "20", "1"), (1.11, "20", "1"), (1.12, "20", "2"), (1.12, "20", "3"), (1.12, "11", "3"), (1.12, "9", "3");
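-- my_fact is bucketed on bucket_col but joined on join_col, so a bucket map
-- join must not be chosen for the query below.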
explain extended
select bucket_col, my_dim.join_col as account1,my_fact.accounting_period
FROM my_fact JOIN my_dim ON my_fact.join_col = my_dim.join_col
WHERE my_fact.fiscal_year = '2015'
AND my_dim.filter_col IN ( 'VAL1', 'VAL2' )
and my_fact.accounting_period in (10);
reset hive.llap.memory.oversubscription.max.executors.per.query;