blob: 0d4fcedc9726b54e9497929412aabd6de5586969 [file] [log] [blame]
--! qt:dataset:src
--! qt:dataset:part
set hive.exec.infer.bucket.sort=true;
-- Test writing to a bucketed table, the output should be bucketed by the bucketing key into the
-- a number of files equal to the number of buckets
CREATE TABLE test_table_bucketed (key STRING, value STRING) PARTITIONED BY (part STRING)
CLUSTERED BY (value) SORTED BY (value) INTO 3 BUCKETS;
-- Despite the fact that normally inferring would say this table is bucketed and sorted on key,
-- this should be bucketed and sorted by value into 3 buckets
INSERT OVERWRITE TABLE test_table_bucketed PARTITION (part = '1')
SELECT key, count(1) FROM src GROUP BY KEY;
DESCRIBE FORMATTED test_table_bucketed PARTITION (part = '1');
-- If the count(*) from sampling the buckets matches the count(*) from each file, the table is
-- bucketed
SELECT COUNT(*) FROM test_table_bucketed TABLESAMPLE (BUCKET 1 OUT OF 3) WHERE part = '1';
SELECT COUNT(*) FROM test_table_bucketed TABLESAMPLE (BUCKET 2 OUT OF 3) WHERE part = '1';
SELECT COUNT(*) FROM test_table_bucketed TABLESAMPLE (BUCKET 3 OUT OF 3) WHERE part = '1';
SELECT cnt FROM (SELECT INPUT__FILE__NAME, COUNT(*) cnt FROM test_table_bucketed WHERE part = '1'
GROUP BY INPUT__FILE__NAME ORDER BY INPUT__FILE__NAME ASC LIMIT 3) a;