-- blob: 1429a6d51fbe5bdbfdc108c4b296a4d954b8ede8 [file] [log] [blame]
--! qt:dataset:src
-- Run without vectorization, and with the dictionary threshold at -1 so that
-- dictionary encoding is guaranteed to be turned off. The statements that
-- follow check that a string column stored without dictionary encoding can
-- be read back correctly.
SET hive.vectorized.execution.enabled=false;
SET hive.exec.orc.dictionary.key.size.threshold=-1;
-- ORC-backed table holding a single string column; the serde and both file
-- formats are named explicitly.
CREATE TABLE test_orc_n5 (
  key STRING
)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS
  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
-- Seed the table with a 10-row sample of src; this should be a single split.
INSERT OVERWRITE TABLE test_orc_n5
SELECT key
FROM src TABLESAMPLE (10 ROWS);

-- Read the column back (key is the only column of test_orc_n5).
SELECT key
FROM test_orc_n5;

-- Set the ORC stripe size property on the table to 1.
ALTER TABLE test_orc_n5
SET SERDEPROPERTIES ('orc.stripe.size' = '1');
-- Plain-text staging table, filled from a local data file.
CREATE TABLE src_thousand (
  key STRING
)
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '../../data/files/kv1kv2.cogroup.txt'
  INTO TABLE src_thousand;

-- Move the dictionary threshold to 0.5, i.e. a 50% cutoff: data above the
-- cutoff is direct encoded, data under it is dictionary encoded.
SET hive.exec.orc.dictionary.key.size.threshold=0.5;
-- Rewrite the table so that alternate stripes encode the column differently.
-- Setting orc.stripe.size = 1 guarantees the stripes each have 5000 rows.
--   * First stripe (prefixes a-e): 5 * 630 distinct values, which is above
--     the 50% cutoff, so it is direct encoded.
--   * Second stripe (prefixes f-j): 5 * 1 distinct values, which is under
--     the cutoff, so it is dictionary encoded.
--   * Final stripe (prefix k): 630 distinct values out of 1000 rows, so it
--     is direct encoded.
-- ORDER BY key is what lines the prefixes up in that sequence.
INSERT OVERWRITE TABLE test_orc_n5
SELECT key
FROM (
  -- Five branches that keep the original key behind a distinct prefix.
  SELECT CONCAT('a', key) AS key FROM src_thousand
  UNION ALL
  SELECT CONCAT('b', key) AS key FROM src_thousand
  UNION ALL
  SELECT CONCAT('c', key) AS key FROM src_thousand
  UNION ALL
  SELECT CONCAT('d', key) AS key FROM src_thousand
  UNION ALL
  SELECT CONCAT('e', key) AS key FROM src_thousand
  UNION ALL
  -- Five branches that each emit one constant value per source row.
  SELECT CONCAT('f', 1) AS key FROM src_thousand
  UNION ALL
  SELECT CONCAT('g', 1) AS key FROM src_thousand
  UNION ALL
  SELECT CONCAT('h', 1) AS key FROM src_thousand
  UNION ALL
  SELECT CONCAT('i', 1) AS key FROM src_thousand
  UNION ALL
  SELECT CONCAT('j', 1) AS key FROM src_thousand
  UNION ALL
  -- One last branch that again keeps the original key.
  SELECT CONCAT('k', key) AS key FROM src_thousand
) prefixed_keys
ORDER BY key
LIMIT 11000;
-- Collapse every key stored in the table into a single hash checksum.
SELECT SUM(HASH(key))
FROM test_orc_n5;