blob: 2823b15ca30e72163e88d66fedb080cf0c49ae7c [file] [log] [blame]
--! qt:dataset:srcpart
set hive.mapred.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
set hive.merge.smallfiles.avgsize=200;
set mapred.input.dir.recursive=true;
set hive.merge.mapfiles=false;
set hive.merge.mapredfiles=false;
set hive.stats.reliable=true;
-- SORT_QUERY_RESULTS
-- list bucketing DML: static partition. multiple skewed columns.
-- ds=2008-04-08/hr=11/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME:
-- 5263 000000_0
-- 5263 000001_0
-- ds=2008-04-08/hr=11/key=103/value=val_103:
-- 99 000000_0
-- 99 000001_0
-- ds=2008-04-08/hr=11/key=484/value=val_484:
-- 87 000000_0
-- 87 000001_0
-- create a skewed table
create table list_bucketing_static_part_n4 (key String, value String)
partitioned by (ds String, hr String)
skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103'))
stored as DIRECTORIES
STORED AS RCFILE;
-- list bucketing DML without merge. use bucketize to generate a few small files.
explain extended
insert overwrite table list_bucketing_static_part_n4 partition (ds = '2008-04-08', hr = '11')
select key, value from srcpart where ds = '2008-04-08';
insert overwrite table list_bucketing_static_part_n4 partition (ds = '2008-04-08', hr = '11')
select key, value from srcpart where ds = '2008-04-08';
-- check DML result
show partitions list_bucketing_static_part_n4;
desc formatted list_bucketing_static_part_n4 partition (ds='2008-04-08', hr='11');
select count(1) from srcpart where ds = '2008-04-08';
select count(*) from list_bucketing_static_part_n4;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set hive.optimize.listbucketing=true;
explain extended
select * from list_bucketing_static_part_n4 where ds = '2008-04-08' and hr = '11' and key = '484' and value = 'val_484';
select * from list_bucketing_static_part_n4 where ds = '2008-04-08' and hr = '11' and key = '484' and value = 'val_484';
select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484';
-- 51 and val_51 in the table so skewed data for 51 and val_14 should be none
-- but query should succeed for 51 or 51 and val_14
select * from srcpart where ds = '2008-04-08' and key = '51';
select * from list_bucketing_static_part_n4 where key = '51';
select * from srcpart where ds = '2008-04-08' and key = '51' and value = 'val_14';
select * from list_bucketing_static_part_n4 where key = '51' and value = 'val_14';
-- queries with < <= > >= should work for skewed test although we don't benefit from pruning
select count(1) from srcpart where ds = '2008-04-08' and key < '51';
select count(1) from list_bucketing_static_part_n4 where key < '51';
select count(1) from srcpart where ds = '2008-04-08' and key <= '51';
select count(1) from list_bucketing_static_part_n4 where key <= '51';
select count(1) from srcpart where ds = '2008-04-08' and key > '51';
select count(1) from list_bucketing_static_part_n4 where key > '51';
select count(1) from srcpart where ds = '2008-04-08' and key >= '51';
select count(1) from list_bucketing_static_part_n4 where key >= '51';
-- clean up
drop table list_bucketing_static_part_n4;