blob: df718fbc9ee0f67341d91fee0975e60c7c7e14f1 [file] [log] [blame]
set hive.strict.checks.bucketing=false;
create table tmp_smb_bucket_10(userid int, pageid int, postid int, type string) partitioned by (ds string) CLUSTERED BY (userid) SORTED BY (pageid, postid, type, userid) INTO 2 BUCKETS STORED AS RCFILE;
alter table tmp_smb_bucket_10 add partition (ds = '1');
alter table tmp_smb_bucket_10 add partition (ds = '2');
-- add dummy files to make sure that the number of files in each partition is same as number of buckets
load data local inpath '../../data/files/smb_rc1/000000_0' INTO TABLE tmp_smb_bucket_10 partition(ds='1');
load data local inpath '../../data/files/smb_rc1/000001_0' INTO TABLE tmp_smb_bucket_10 partition(ds='1');
load data local inpath '../../data/files/smb_rc1/000000_0' INTO TABLE tmp_smb_bucket_10 partition(ds='2');
load data local inpath '../../data/files/smb_rc1/000001_0' INTO TABLE tmp_smb_bucket_10 partition(ds='2');
set hive.cbo.enable=false;
set hive.optimize.bucketmapjoin = true;
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
explain
select /*+mapjoin(a)*/ * from tmp_smb_bucket_10 a join tmp_smb_bucket_10 b
on (a.ds = '1' and b.ds = '2' and
a.userid = b.userid and
a.pageid = b.pageid and
a.postid = b.postid and
a.type = b.type);