blob: 0785bca9d5c77d52f86de4f05875e7d77d5f5106 [file] [log] [blame]
--! qt:dataset:src
--! qt:dataset:part
set hive.mapred.mode=nonstrict;
set hive.exec.infer.bucket.sort=true;
-- This tests inferring how data is bucketed/sorted from the operators in the reducer
-- and populating that information in partitions' metadata
CREATE TABLE test_table_n5 (key STRING, value STRING) PARTITIONED BY (part STRING);
-- Test group by, should be bucketed and sorted by group by key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, count(*) FROM src GROUP BY key;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by where a key isn't selected, should not be bucketed or sorted
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, count(*) FROM src GROUP BY key, value;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test join, should be bucketed and sorted by join key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT a.key, a.value FROM src a JOIN src b ON a.key = b.key;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test join with two keys, should be bucketed and sorted by join keys
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT a.key, a.value FROM src a JOIN src b ON a.key = b.key AND a.value = b.value;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test join with two keys and only one selected, should not be bucketed or sorted
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT a.key, '1' FROM src a JOIN src b ON a.key = b.key AND a.value = b.value;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test join on three tables on same key, should be bucketed and sorted by join key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT a.key, c.value FROM src a JOIN src b ON (a.key = b.key) JOIN src c ON (b.key = c.key);
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test join on three tables on different keys, should be bucketed and sorted by latter key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT a.key, c.value FROM src a JOIN src b ON (a.key = b.key) JOIN src c ON (b.value = c.value);
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test distribute by, should only be bucketed by key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM src DISTRIBUTE BY key;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test sort by, should be sorted by key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM src SORT BY key ASC;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test sort by desc, should be sorted by key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM src SORT BY key DESC;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test cluster by, should be bucketed and sorted by key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM src CLUSTER BY key;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test distribute by and sort by different keys, should be bucketed by one key sorted by the other
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM src DISTRIBUTE BY key SORT BY value;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test join in simple subquery, should be bucketed and sorted on key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value from (SELECT a.key, b.value FROM src a JOIN src b ON (a.key = b.key)) subq;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test join in simple subquery renaming key column, should be bucketed and sorted on key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT k, value FROM (SELECT a.key as k, b.value FROM src a JOIN src b ON (a.key = b.key)) subq;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in simple subquery, should be bucketed and sorted on key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, cnt from (SELECT key, count(*) as cnt FROM src GROUP BY key) subq;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in simple subquery renaming key column, should be bucketed and sorted on key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT k, cnt FROM (SELECT key as k, count(*) as cnt FROM src GROUP BY key) subq;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in subquery with where outside, should still be bucketed and sorted on key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM (SELECT key, count(1) AS value FROM src group by key) a where key < 10;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in subquery with expression on value, should still be bucketed and sorted on key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value + 1 FROM (SELECT key, count(1) AS value FROM src group by key) a where key < 10;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in subquery with lateral view outside, should still be bucketed and sorted
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM (SELECT key FROM src group by key) a lateral view explode(array(1, 2)) value as value;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in subquery with another group by outside, should be bucketed and sorted by the
-- key of the outer group by
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT count(1), value FROM (SELECT key, count(1) as value FROM src group by key) a group by value;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in subquery with select on outside reordering the columns, should be bucketed and
-- sorted by the column the group by key ends up in
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT value, key FROM (SELECT key, count(1) as value FROM src group by key) a;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in subquery followed by distribute by, should only be bucketed by the distribute key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM (SELECT key, count(1) as value FROM src group by key) a distribute by key;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in subquery followed by sort by, should only be sorted by the sort key
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM (SELECT key, count(1) as value FROM src group by key) a sort by key;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by in subquery followed by transform script, should not be bucketed or sorted
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT TRANSFORM (a.key, a.value) USING 'cat' AS (key, value) FROM (SELECT key, count(1) AS value FROM src GROUP BY KEY) a;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');
-- Test group by on function, should be bucketed and sorted by key and value because the function is applied in the mapper
INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1')
SELECT key, value FROM (SELECT concat(key, "a") AS key, value, count(*) FROM src GROUP BY concat(key, "a"), value) a;
DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1');