set hive.mapred.mode=nonstrict;
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.vectorized.execution.enabled=true;
-- enable ppd
set hive.optimize.index.filter=true;
set hive.explain.user=false;
-- needed for TestCliDriver but not TestMiniTezCliDriver
-- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
-- attempts to get compaction to run - see below
-- set yarn.scheduler.maximum-allocation-mb=2024;
-- set hive.tez.container.size=500;
-- set mapreduce.map.memory.mb=500;
-- set mapreduce.reduce.memory.mb=500;
-- set mapred.job.map.memory.mb=500;
-- set mapred.job.reduce.memory.mb=500;
CREATE TEMPORARY FUNCTION runWorker AS 'org.apache.hadoop.hive.ql.udf.UDFRunWorker';
create table mydual(a int);
insert into mydual values(1);
CREATE TABLE over10k_n2(t tinyint,
si smallint,
i int,
b bigint,
f float,
d double,
bo boolean,
s string,
ts timestamp,
`dec` decimal(4,2),
bin binary)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
-- oddly this has 9999 rows, not > 10K
LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over10k_n2;
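-- (illustrative, commented out - not part of the test) a row count would confirm the 9999 rows:
-- select count(*) from over10k_n2;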
CREATE TABLE over10k_orc_bucketed(t tinyint,
si smallint,
i int,
b bigint,
f float,
d double,
bo boolean,
s string,
ts timestamp,
`dec` decimal(4,2),
bin binary) CLUSTERED BY(si) INTO 4 BUCKETS STORED AS ORC;
-- this produces about 250 distinct values across all 4 equivalence classes
select distinct si, si%4 from over10k_n2 order by si;
-- explain insert into over10k_orc_bucketed select * from over10k_n2;
insert into over10k_orc_bucketed select * from over10k_n2;
dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/over10k_orc_bucketed;
-- create copy_N files
insert into over10k_orc_bucketed select * from over10k_n2;
-- the output of this is masked in .out - it is visible in .orig
dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/over10k_orc_bucketed;
-- this actually shows the data files in the .out on Tez but not on LLAP
select distinct 7 as seven, INPUT__FILE__NAME from over10k_orc_bucketed;
-- convert table to acid
alter table over10k_orc_bucketed set TBLPROPERTIES ('transactional'='true');
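-- (illustrative, commented out - not part of the test) the conversion can be verified by inspecting the table properties:
-- show tblproperties over10k_orc_bucketed;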
-- this should vectorize (and push predicate to storage: filterExpr in TableScan)
-- Execution mode: vectorized, llap
-- LLAP IO: may be used (ACID table)
explain select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i;
select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i;
-- this should vectorize (and push predicate to storage: filterExpr in TableScan)
-- Execution mode: vectorized, llap
-- LLAP IO: may be used (ACID table)
explain select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID;
-- HIVE-17943
select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID;
-- this should vectorize (and push predicate to storage: filterExpr in TableScan)
-- Execution mode: vectorized, llap
-- LLAP IO: may be used (ACID table)
explain update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100;
update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100;
-- this should produce the same result (data) as the previous time this exact query ran
-- ROW__ID will be different (same bucketProperty)
select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID;
-- The idea below was to run checksum queries to ensure that ROW__IDs are unique,
-- to run Compaction, and to check that ROW__IDs are the same before and after compaction (for rows
-- w/o applicable delete events)
-- Everything below is commented out because it doesn't work
-- group by ROW__ID produces wrong results
-- compactor doesn't run - see error below...
-- this doesn't vectorize but uses llap, which is perhaps the problem if it's using the cache
-- use explain VECTORIZATION DETAIL to see
-- notVectorizedReason: Key expression for GROUPBY operator: Vectorizing complex type STRUCT not supported
explain select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;
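-- (illustrative, commented out - not part of the test) the notVectorizedReason above can be seen with:
-- explain VECTORIZATION DETAIL select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;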
-- this tests that there are no duplicate ROW__IDs, so it should produce no output
-- on LLAP this produces "NULL, 6"; on tez it produces nothing: HIVE-17921
-- this makes sure that the same code is running on the Ptest and on localhost. The target is:
-- Original split count is 11 grouped split count is 1, for bucket: 1
set tez.grouping.split-count=1;
select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;
-- this produces nothing (as it should)
select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
-- schedule compactor
-- alter table over10k_orc_bucketed compact 'major' WITH OVERWRITE TBLPROPERTIES ("compactor.mapreduce.map.memory.mb"="500","compactor.hive.tez.container.size"="500");
-- run compactor - this currently fails with
-- Invalid resource request, requested memory < 0, or requested memory > max configured, requestedMemory=1536, maxMemory=512
-- HIVE-17922
-- select runWorker() from mydual;
-- show compactions;
-- this should produce the same result (data + ROW__ID) as the previous time this exact query ran
-- select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID;
-- this tests that there are no duplicate ROW__IDs, so it should produce no output
-- select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1;
-- select ROW__ID, * from over10k_orc_bucketed where ROW__ID is null;
-- TODO: Remove this line after fixing HIVE-24351
set hive.vectorized.execution.enabled=false;
CREATE TABLE over10k_orc STORED AS ORC as select * from over10k_n2 where t between 3 and 4;
-- Make sure there are multiple original files
INSERT INTO over10k_orc select * from over10k_n2 where t between 3 and 4;
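-- (illustrative, commented out - not part of the test) listing the table dir would show the multiple original files:
-- dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/over10k_orc;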
-- TODO: Remove this line after fixing HIVE-24351
set hive.vectorized.execution.enabled=true;
alter table over10k_orc set TBLPROPERTIES ('transactional'='true');
-- row id is projected but there are no delete deltas
set hive.exec.orc.split.strategy=ETL;
select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
on o1.ROW__ID.rowid == o2.ROW__ID.rowid and o1.ROW__ID.writeid == o2.ROW__ID.writeid and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid;
set hive.exec.orc.split.strategy=BI;
select o1.ROW__ID r1, o1.* from over10k_orc o1 join over10k_orc o2
on o1.ROW__ID.rowid == o2.ROW__ID.rowid
and o1.ROW__ID.writeid == o2.ROW__ID.writeid
and o1.ROW__ID.bucketid == o2.ROW__ID.bucketid;
delete from over10k_orc where t = 3;
-- row id is not projected but there are delete deltas
set hive.exec.orc.split.strategy=ETL;
select t, count(*) from over10k_orc
group by t;
set hive.exec.orc.split.strategy=BI;
select t, count(*) from over10k_orc
group by t;
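-- fetch rows including deleted ones via the 'acid.fetch.deleted.rows' table property; ROW__IS__DELETED marks the deleted rows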
select oo.ROW__ID.writeId, oo.ROW__IS__DELETED, oo.* from over10k_orc('acid.fetch.deleted.rows'='true') oo order by si;