--! qt:dataset:src
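-- Run without vectorized execution and outside strict mode for this test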
set hive.vectorized.execution.enabled=false;
set hive.mapred.mode=nonstrict;
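-- Drop any tables left over from previous runs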
drop table if exists staging_n5;
drop table if exists parquet_jointable1;
drop table if exists parquet_jointable2;
drop table if exists parquet_jointable1_bucketed_sorted;
drop table if exists parquet_jointable2_bucketed_sorted;
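-- Stage two distinct rows from src, then materialize them as the Parquet join inputs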
create table staging_n5 (key int, value string) stored as textfile;
insert into table staging_n5 select distinct key, value from src order by key limit 2;
create table parquet_jointable1 stored as parquet as select * from staging_n5;
create table parquet_jointable2 stored as parquet as select key,key+1 as key2,concat(value,"value") as myvalue from staging_n5;
-- SORT_QUERY_RESULTS
-- MR join
explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;
select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;
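-- Allow the planner to convert the common join into a map join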
set hive.auto.convert.join=true;
-- The two tables involved in the join have different numbers of columns (parquet_jointable1 has 2, parquet_jointable2 has 3). For map and SMB joins,
-- when the second table is loaded, the column indices in hive.io.file.readcolumn.ids refer to columns of both the first and the second table,
-- so the Parquet schema/types passed to ParquetInputSplit must contain only the column indexes that belong to the second/current table.
-- Map join
explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;
select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;
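-- Enable bucket map join, its sort-merge (SMB) upgrade, automatic SMB conversion, and the bucket-aware input format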
set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
-- SMB join
create table parquet_jointable1_bucketed_sorted (key int,value string) clustered by (key) sorted by (key asc) into 1 buckets stored as parquet;
insert overwrite table parquet_jointable1_bucketed_sorted select key,concat(value,"value1") as value from staging_n5 cluster by key;
create table parquet_jointable2_bucketed_sorted (key int,value1 string,value2 string) clustered by (key) sorted by (key asc) into 1 buckets stored as parquet;
insert overwrite table parquet_jointable2_bucketed_sorted select key,concat(value,"value2-1") as value1,concat(value,"value2-2") as value2 from staging_n5 cluster by key;
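-- With both inputs bucketed and sorted on key, the join below should execute as a sort-merge-bucket join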
explain select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key;
select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key;