| --! qt:dataset:src |
| --! qt:dataset:part |
| set hive.mapred.mode=nonstrict; |
| set hive.exec.infer.bucket.sort=true; |
| |
| -- This tests inferring how data is bucketed/sorted from the operators in the reducer |
| -- and populating that information in partitions' metadata |
| |
| CREATE TABLE test_table_n5 (key STRING, value STRING) PARTITIONED BY (part STRING); |
| |
| -- Test group by, should be bucketed and sorted by group by key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, count(*) FROM src GROUP BY key; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by where a key isn't selected, should not be bucketed or sorted |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, count(*) FROM src GROUP BY key, value; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test join, should be bucketed and sorted by join key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT a.key, a.value FROM src a JOIN src b ON a.key = b.key; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test join with two keys, should be bucketed and sorted by join keys |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT a.key, a.value FROM src a JOIN src b ON a.key = b.key AND a.value = b.value; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test join with two keys and only one selected, should not be bucketed or sorted |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT a.key, '1' FROM src a JOIN src b ON a.key = b.key AND a.value = b.value; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test join on three tables on same key, should be bucketed and sorted by join key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT a.key, c.value FROM src a JOIN src b ON (a.key = b.key) JOIN src c ON (b.key = c.key); |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test join on three tables on different keys, should be bucketed and sorted by latter key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT a.key, c.value FROM src a JOIN src b ON (a.key = b.key) JOIN src c ON (b.value = c.value); |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test distribute by, should only be bucketed by key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM src DISTRIBUTE BY key; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test sort by, should be sorted by key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM src SORT BY key ASC; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test sort by desc, should be sorted by key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM src SORT BY key DESC; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test cluster by, should be bucketed and sorted by key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM src CLUSTER BY key; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test distribute by and sort by different keys, should be bucketed by one key sorted by the other |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM src DISTRIBUTE BY key SORT BY value; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test join in simple subquery, should be bucketed and sorted on key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value from (SELECT a.key, b.value FROM src a JOIN src b ON (a.key = b.key)) subq; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test join in simple subquery renaming key column, should be bucketed and sorted on key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT k, value FROM (SELECT a.key as k, b.value FROM src a JOIN src b ON (a.key = b.key)) subq; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in simple subquery, should be bucketed and sorted on key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, cnt from (SELECT key, count(*) as cnt FROM src GROUP BY key) subq; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in simple subquery renaming key column, should be bucketed and sorted on key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT k, cnt FROM (SELECT key as k, count(*) as cnt FROM src GROUP BY key) subq; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in subquery with where outside, should still be bucketed and sorted on key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM (SELECT key, count(1) AS value FROM src group by key) a where key < 10; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in subquery with expression on value, should still be bucketed and sorted on key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value + 1 FROM (SELECT key, count(1) AS value FROM src group by key) a where key < 10; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in subquery with lateral view outside, should still be bucketed and sorted |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM (SELECT key FROM src group by key) a lateral view explode(array(1, 2)) value as value; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in subquery with another group by outside, should be bucketed and sorted by the |
| -- key of the outer group by |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT count(1), value FROM (SELECT key, count(1) as value FROM src group by key) a group by value; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in subquery with select on outside reordering the columns, should be bucketed and |
| -- sorted by the column the group by key ends up in |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT value, key FROM (SELECT key, count(1) as value FROM src group by key) a; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in subquery followed by distribute by, should only be bucketed by the distribute key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM (SELECT key, count(1) as value FROM src group by key) a distribute by key; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in subquery followed by sort by, should only be sorted by the sort key |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM (SELECT key, count(1) as value FROM src group by key) a sort by key; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by in subquery followed by transform script, should not be bucketed or sorted |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT TRANSFORM (a.key, a.value) USING 'cat' AS (key, value) FROM (SELECT key, count(1) AS value FROM src GROUP BY KEY) a; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |
| |
| -- Test group by on function, should be bucketed and sorted by key and value because the function is applied in the mapper |
| INSERT OVERWRITE TABLE test_table_n5 PARTITION (part = '1') |
| SELECT key, value FROM (SELECT concat(key, "a") AS key, value, count(*) FROM src GROUP BY concat(key, "a"), value) a; |
| |
| DESCRIBE FORMATTED test_table_n5 PARTITION (part = '1'); |