set hive.mapred.mode=nonstrict;
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.vectorized.execution.enabled=true;
-- enable ppd
set hive.optimize.index.filter=true;
set hive.explain.user=false;
-- needed for TestCliDriver but not TestMiniTezCliDriver
-- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
-- attempts to get compaction to run - see below
-- set yarn.scheduler.maximum-allocation-mb=2024;
-- set hive.tez.container.size=500;
-- set mapreduce.map.memory.mb=500;
-- set mapreduce.reduce.memory.mb=500;
-- set mapred.job.map.memory.mb=500;
-- set mapred.job.reduce.memory.mb=500;
CREATE TEMPORARY FUNCTION runWorker AS 'org.apache.hadoop.hive.ql.udf.UDFRunWorker';
create table mydual(a int);
insert into mydual values(1);
CREATE TABLE over10k_n2(t tinyint,
si smallint,
i int,
b bigint,
f float,
d double,
bo boolean,
s string,
ts timestamp,
`dec` decimal(4,2),
bin binary)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
-- oddly this has 9999 rows, not > 10K
LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over10k_n2;
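-- (illustrative, commented out - not part of the test) a row count would confirm the 9999 rows:
-- select count(*) from over10k_n2;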
CREATE TABLE over10k_orc_bucketed(t tinyint,
si smallint,
i int,
b bigint,
f float,
d double,
bo boolean,
s string,
ts timestamp,
`dec` decimal(4,2),
bin binary) CLUSTERED BY(si) INTO 4 BUCKETS STORED AS ORC;
-- this produces about 250 distinct values across all 4 equivalence classes
select distinct si, si%4 from over10k_n2 order by si;
-- explain insert into over10k_orc_bucketed select * from over10k_n2;
insert into over10k_orc_bucketed select * from over10k_n2;
dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/over10k_orc_bucketed;
-- create copy_N files
insert into over10k_orc_bucketed select * from over10k_n2;
-- the output of this is masked in .out - it is visible in .orig
dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/over10k_orc_bucketed;
-- this actually shows the data files in the .out on Tez but not on LLAP
select distinct 7 as seven, INPUT__FILE__NAME from over10k_orc_bucketed;
-- convert table to acid
alter table over10k_orc_bucketed set TBLPROPERTIES ('transactional'='true');
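-- (illustrative, commented out - not part of the test) the conversion can be verified by inspecting the table properties:
-- show tblproperties over10k_orc_bucketed;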
-- this should vectorize (and push predicate to storage: filterExpr in TableScan)
-- Execution mode: vectorized, llap
-- LLAP IO: may be used (ACID table)
explain select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i;
select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i;
-- this should vectorize (and push predicate to storage: filterExpr in TableScan)
-- Execution mode: vectorized, llap
-- LLAP IO: may be used (ACID table)
explain select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID;
-- HIVE-17943
select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID;
-- this should vectorize (and push predicate to storage: filterExpr in TableScan)
-- Execution mode: vectorized, llap
-- LLAP IO: may be used (ACID table)
explain update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100;
update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100;
-- this should produce the same result (data) as the previous time this exact query ran
-- ROW__ID will be different (same bucketProperty)
select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID;
-- The idea below was to run checksum queries to ensure that ROW__IDs are unique,
-- to run Compaction, and to check that ROW__IDs are the same before and after compaction (for rows
-- w/o applicable delete events)
-- Everything below is commented out because it doesn't work
-- group by ROW__ID produces wrong results
-- compactor doesn't run - see error below...
-- this doesn't vectorize but uses llap, which is perhaps the problem if it's using the cache
-- use explain VECTORIZATION DETAIL to see
-- notVectorizedReason: Key expression for GROUPBY operator: Vectorizing complex type STRUCT not supported
explain select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;
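-- (illustrative, commented out - not part of the test) the notVectorizedReason above can be seen with:
-- explain VECTORIZATION DETAIL select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;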
-- this tests that there are no duplicate ROW__IDs, so it should produce no output
-- on LLAP this produces "NULL, 6"; on tez it produces nothing: HIVE-17921
-- this makes sure that the same code is running on the Ptest and on localhost. The target is:
-- Original split count is 11 grouped split count is 1, for bucket: 1
set tez.grouping.split-count=1;
select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;
-- this produces nothing (as it should)
select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
-- schedule compactor
-- alter table over10k_orc_bucketed compact 'major' WITH OVERWRITE TBLPROPERTIES ("compactor.mapreduce.map.memory.mb"="500","compactor.hive.tez.container.size"="500");
-- run compactor - this currently fails with
-- Invalid resource request, requested memory < 0, or requested memory > max configured, requestedMemory=1536, maxMemory=512
-- HIVE-17922
-- select runWorker() from mydual;
-- show compactions;
-- this should produce the same result (data + ROW__ID) as the previous time this exact query ran
-- select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID;
-- this tests that there are no duplicate ROW__IDs, so it should produce no output
-- select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;
-- select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
-- TODO: Remove this line after fixing HIVE-24351
set hive.vectorized.execution.enabled=false;
CREATE TABLE over10k_orc STORED AS ORC as select * from over10k_n2 where t between 3 and 4;
-- Make sure there are multiple original files
INSERT INTO over10k_orc select * from over10k_n2 where t between 3 and 4;
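-- (illustrative, commented out - not part of the test) listing the table dir would show the multiple original files:
-- dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/over10k_orc;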
-- TODO: Remove this line after fixing HIVE-24351
set hive.vectorized.execution.enabled=true;
alter table over10k_orc set TBLPROPERTIES ('transactional'='true');
-- row id is projected but there are no delete deltas
set hive.exec.orc.split.strategy=ETL;
select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
on o1.ROW__ID.rowid == o2.ROW__ID.rowid and o1.ROW__ID.writeid == o2.ROW__ID.writeid and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid;
set hive.exec.orc.split.strategy=BI;
select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
on o1.ROW__ID.rowid == o2.ROW__ID.rowid
and o1.ROW__ID.writeid == o2.ROW__ID.writeid
and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid;
delete from over10k_orc where t = 3;
-- row id is not projected but there are delete deltas
set hive.exec.orc.split.strategy=ETL;
select t, count(*) from over10k_orc
group by t;
set hive.exec.orc.split.strategy=BI;
select t, count(*) from over10k_orc
group by t;
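-- fetch rows including deleted ones via the 'acid.fetch.deleted.rows' table property; ROW__IS__DELETED marks the deleted rows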
select oo.ROW__ID.writeId, oo.ROW__IS__DELETED, oo.* from over10k_orc('acid.fetch.deleted.rows'='true') oo order by si;