-- Source: ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q (Apache Hive)

 --! qt:dataset:alltypesorc
 -- Session setup shared by every section below: fetch-task conversion is set to
 -- "none", vectorized execution is enabled (including the reduce side), and both
 -- the row-serde and vector-serde deserialize options start out enabled.
 set hive.fetch.task.conversion=none;
 set hive.vectorized.use.row.serde.deserialize=true;
 set hive.vectorized.use.vector.serde.deserialize=true;
 set hive.vectorized.execution.enabled=true;
 set hive.vectorized.execution.reduce.enabled=true;

 -- SORT_QUERY_RESULTS

 -- Parquet-backed fixture table with six columns taken from the alltypesorc dataset.
 -- "if not exists" here plus "insert overwrite" below keep this setup re-runnable.
 create table if not exists alltypes_parquet_n0 (
   cint int,
   ctinyint tinyint,
   csmallint smallint,
   cfloat float,
   cdouble double,
   cstring1 string) stored as parquet;

 -- Replace the table contents with the same-named columns of alltypesorc.
 insert overwrite table alltypes_parquet_n0
   select cint,
     ctinyint,
     csmallint,
     cfloat,
     cdouble,
     cstring1
   from alltypesorc;

 -- Baseline: test native file input format vectorization on the Parquet table.
 -- hive.vectorized.input.format.excludes has not been set at this point in the script.

 -- Vectorization plan for a filtered scan capped at 10 rows.
 explain vectorization select *
   from alltypes_parquet_n0
   where cint = 528534767
   limit 10;

 -- Execute the same filtered scan.
 select *
   from alltypes_parquet_n0
   where cint = 528534767
   limit 10;

 -- Vectorization plan for a group-by on ctinyint with five aggregates.
 explain vectorization select ctinyint,
   max(cint),
   min(csmallint),
   count(cstring1),
   avg(cfloat),
   stddev_pop(cdouble)
   from alltypes_parquet_n0
   group by ctinyint;

 -- Execute the same aggregation.
 select ctinyint,
   max(cint),
   min(csmallint),
   count(cstring1),
   avg(cfloat),
   stddev_pop(cdouble)
   from alltypes_parquet_n0
   group by ctinyint;

 -- Exclude MapredParquetInputFormat from vectorization; this should cause map-work
 -- vectorization to be disabled for the Parquet table. The four statements below
 -- are the same scan and aggregation used in the baseline section.
 set hive.vectorized.input.format.excludes=org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;

 -- Plan for the filtered scan with the Parquet input format excluded.
 explain vectorization select *
   from alltypes_parquet_n0
   where cint = 528534767
   limit 10;

 -- Execute the filtered scan.
 select *
   from alltypes_parquet_n0
   where cint = 528534767
   limit 10;

 -- Plan for the group-by aggregation with the Parquet input format excluded.
 explain vectorization select ctinyint,
   max(cint),
   min(csmallint),
   count(cstring1),
   avg(cfloat),
   stddev_pop(cdouble)
   from alltypes_parquet_n0
   group by ctinyint;

 -- Execute the aggregation.
 select ctinyint,
   max(cint),
   min(csmallint),
   count(cstring1),
   avg(cfloat),
   stddev_pop(cdouble)
   from alltypes_parquet_n0
   group by ctinyint;


 -- Clear hive.vectorized.input.format.excludes (assign an empty value) and confirm
 -- that the vectorizer vectorizes the map work again for the same queries.
 set hive.vectorized.input.format.excludes=;

 -- Plan for the filtered scan with no input formats excluded.
 explain vectorization select *
   from alltypes_parquet_n0
   where cint = 528534767
   limit 10;

 -- Execute the filtered scan.
 select *
   from alltypes_parquet_n0
   where cint = 528534767
   limit 10;

 -- Plan for the group-by aggregation with no input formats excluded.
 explain vectorization select ctinyint,
   max(cint),
   min(csmallint),
   count(cstring1),
   avg(cfloat),
   stddev_pop(cdouble)
   from alltypes_parquet_n0
   group by ctinyint;

 -- Execute the aggregation.
 select ctinyint,
   max(cint),
   min(csmallint),
   count(cstring1),
   avg(cfloat),
   stddev_pop(cdouble)
   from alltypes_parquet_n0
   group by ctinyint;


 -- Check that hive.vectorized.input.format.excludes also works with non-Parquet
 -- input formats: the list now names OrcInputFormat as well as
 -- MapredParquetInputFormat, and both serde-deserialize options are switched off.
 set hive.vectorized.input.format.excludes=org.apache.hadoop.hive.ql.io.orc.OrcInputFormat,org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
 set hive.vectorized.use.row.serde.deserialize=false;
 set hive.vectorized.use.vector.serde.deserialize=false;


 -- ORC-backed fixture table with the same six columns as alltypes_parquet_n0.
 create table if not exists alltypes_orc_n2 (
   cint int,
   ctinyint tinyint,
   csmallint smallint,
   cfloat float,
   cdouble double,
   cstring1 string) stored as orc;

 -- Replace the table contents with the same-named columns of alltypesorc.
 insert overwrite table alltypes_orc_n2
   select cint,
     ctinyint,
     csmallint,
     cfloat,
     cdouble,
     cstring1
   from alltypesorc;

 -- Plan for the filtered scan over the ORC table.
 explain vectorization select *
   from alltypes_orc_n2
   where cint = 528534767
   limit 10;

 -- Execute the filtered scan.
 select *
   from alltypes_orc_n2
   where cint = 528534767
   limit 10;

 -- Plan for the group-by aggregation over the ORC table.
 explain vectorization select ctinyint,
   max(cint),
   min(csmallint),
   count(cstring1),
   avg(cfloat),
   stddev_pop(cdouble)
   from alltypes_orc_n2
   group by ctinyint;

 -- Execute the aggregation.
 select ctinyint,
   max(cint),
   min(csmallint),
   count(cstring1),
   avg(cfloat),
   stddev_pop(cdouble)
   from alltypes_orc_n2
   group by ctinyint;

 -- Test that when the input format is excluded, the row serde is used for
 -- vectorization. Both the Parquet and ORC input formats stay on the exclude list,
 -- row-serde deserialize is switched back on, and the row-serde input-format
 -- exclude list is assigned an empty value.
 set hive.vectorized.input.format.excludes=org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat,org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
 set hive.vectorized.use.vectorized.input.format=true;
 set hive.vectorized.use.row.serde.deserialize=true;
 set hive.vectorized.row.serde.inputformat.excludes=;

 -- ORC case: a two-column tinyint table with three rows.
 create table orcTbl (t1 tinyint, t2 tinyint)
 stored as orc;

 -- Arithmetically, only (54, 9) has t1 + t2 above 10 (63, versus -79 and -88).
 insert into orcTbl values (54, 9), (-104, 25), (-112, 24);

 -- Plan for filtering and projecting on the tinyint sum.
 explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10;

 -- Execute the query.
 select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10;

 -- Parquet case: same schema, rows and query as the ORC case above.
 create table parquetTbl (t1 tinyint, t2 tinyint)
 stored as parquet;

 insert into parquetTbl values (54, 9), (-104, 25), (-112, 24);

 -- Plan for filtering and projecting on the tinyint sum.
 explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10;

 -- Execute the query.
 SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10;
	--! qt:dataset:alltypesorc
	-- Second pass over the same scenario: reset the session to the starting state.
	-- Fetch-task conversion is "none", vectorized execution is on (including the
	-- reduce side), and both serde-deserialize options are enabled again.
	set hive.fetch.task.conversion=none;
	set hive.vectorized.use.row.serde.deserialize=true;
	set hive.vectorized.use.vector.serde.deserialize=true;
	set hive.vectorized.execution.enabled=true;
	set hive.vectorized.execution.reduce.enabled=true;

	-- SORT_QUERY_RESULTS

	-- Parquet fixture table; "if not exists" makes this a no-op when the table is
	-- already present, and "insert overwrite" below replaces its rows either way.
	create table if not exists alltypes_parquet_n0 (
	cint int,
	ctinyint tinyint,
	csmallint smallint,
	cfloat float,
	cdouble double,
	cstring1 string) stored as parquet;

	-- Reload the six columns from the alltypesorc dataset.
	insert overwrite table alltypes_parquet_n0
	select cint,
	ctinyint,
	csmallint,
	cfloat,
	cdouble,
	cstring1
	from alltypesorc;

	-- Baseline: test native file input format vectorization on the Parquet table.
	-- NOTE(review): hive.vectorized.input.format.excludes is not reset before this
	-- point, so it still holds the value assigned earlier in the script — confirm
	-- that this is intended for a baseline run.

	-- Vectorization plan for a filtered scan capped at 10 rows.
	explain vectorization select *
	from alltypes_parquet_n0
	where cint = 528534767
	limit 10;

	-- Execute the same filtered scan.
	select *
	from alltypes_parquet_n0
	where cint = 528534767
	limit 10;

	-- Vectorization plan for a group-by on ctinyint with five aggregates.
	explain vectorization select ctinyint,
	max(cint),
	min(csmallint),
	count(cstring1),
	avg(cfloat),
	stddev_pop(cdouble)
	from alltypes_parquet_n0
	group by ctinyint;

	-- Execute the same aggregation.
	select ctinyint,
	max(cint),
	min(csmallint),
	count(cstring1),
	avg(cfloat),
	stddev_pop(cdouble)
	from alltypes_parquet_n0
	group by ctinyint;

	-- Exclude MapredParquetInputFormat from vectorization; this should cause map-work
	-- vectorization to be disabled for the Parquet table.
	set hive.vectorized.input.format.excludes=org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;

	-- Plan for the filtered scan with the Parquet input format excluded.
	explain vectorization select *
	from alltypes_parquet_n0
	where cint = 528534767
	limit 10;

	-- Execute the filtered scan.
	select *
	from alltypes_parquet_n0
	where cint = 528534767
	limit 10;

	-- Plan for the group-by aggregation with the Parquet input format excluded.
	explain vectorization select ctinyint,
	max(cint),
	min(csmallint),
	count(cstring1),
	avg(cfloat),
	stddev_pop(cdouble)
	from alltypes_parquet_n0
	group by ctinyint;

	-- Execute the aggregation.
	select ctinyint,
	max(cint),
	min(csmallint),
	count(cstring1),
	avg(cfloat),
	stddev_pop(cdouble)
	from alltypes_parquet_n0
	group by ctinyint;


	-- Clear hive.vectorized.input.format.excludes (assign an empty value) and confirm
	-- that the vectorizer vectorizes the map work again.
	set hive.vectorized.input.format.excludes=;

	-- Plan for the filtered scan with no input formats excluded.
	explain vectorization select *
	from alltypes_parquet_n0
	where cint = 528534767
	limit 10;

	-- Execute the filtered scan.
	select *
	from alltypes_parquet_n0
	where cint = 528534767
	limit 10;

	-- Plan for the group-by aggregation with no input formats excluded.
	explain vectorization select ctinyint,
	max(cint),
	min(csmallint),
	count(cstring1),
	avg(cfloat),
	stddev_pop(cdouble)
	from alltypes_parquet_n0
	group by ctinyint;

	-- Execute the aggregation.
	select ctinyint,
	max(cint),
	min(csmallint),
	count(cstring1),
	avg(cfloat),
	stddev_pop(cdouble)
	from alltypes_parquet_n0
	group by ctinyint;


	-- Check that hive.vectorized.input.format.excludes also works with non-Parquet
	-- input formats: OrcInputFormat and MapredParquetInputFormat are both listed,
	-- and both serde-deserialize options are switched off.
	set hive.vectorized.input.format.excludes=org.apache.hadoop.hive.ql.io.orc.OrcInputFormat,org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
	set hive.vectorized.use.row.serde.deserialize=false;
	set hive.vectorized.use.vector.serde.deserialize=false;


	-- ORC fixture table; "if not exists" makes this a no-op when already present.
	create table if not exists alltypes_orc_n2 (
	cint int,
	ctinyint tinyint,
	csmallint smallint,
	cfloat float,
	cdouble double,
	cstring1 string) stored as orc;

	-- Reload the six columns from the alltypesorc dataset.
	insert overwrite table alltypes_orc_n2
	select cint,
	ctinyint,
	csmallint,
	cfloat,
	cdouble,
	cstring1
	from alltypesorc;

	-- Plan for the filtered scan over the ORC table.
	explain vectorization select *
	from alltypes_orc_n2
	where cint = 528534767
	limit 10;

	-- Execute the filtered scan.
	select *
	from alltypes_orc_n2
	where cint = 528534767
	limit 10;

	-- Plan for the group-by aggregation over the ORC table.
	explain vectorization select ctinyint,
	max(cint),
	min(csmallint),
	count(cstring1),
	avg(cfloat),
	stddev_pop(cdouble)
	from alltypes_orc_n2
	group by ctinyint;

	-- Execute the aggregation.
	select ctinyint,
	max(cint),
	min(csmallint),
	count(cstring1),
	avg(cfloat),
	stddev_pop(cdouble)
	from alltypes_orc_n2
	group by ctinyint;

	-- Test that when the input format is excluded, the row serde is used for
	-- vectorization. Parquet and ORC input formats stay on the exclude list,
	-- row-serde deserialize is switched back on, and the row-serde input-format
	-- exclude list is assigned an empty value.
	set hive.vectorized.input.format.excludes=org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat,org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
	set hive.vectorized.use.vectorized.input.format=true;
	set hive.vectorized.use.row.serde.deserialize=true;
	set hive.vectorized.row.serde.inputformat.excludes=;

	-- ORC case: a two-column tinyint table with three rows.
	-- orcTbl is already created and populated earlier in this script, and it is
	-- built with a plain "create table" plus "insert into". Drop it first so the
	-- create does not fail on the existing table and the rows are not appended twice.
	drop table if exists orcTbl;

	create table orcTbl (t1 tinyint, t2 tinyint)
	stored as orc;

	-- Arithmetically, only (54, 9) has t1 + t2 above 10 (63, versus -79 and -88).
	insert into orcTbl values (54, 9), (-104, 25), (-112, 24);

	-- Plan for filtering and projecting on the tinyint sum.
	explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10;

	-- Execute the query.
	select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10;

	-- Parquet case: a two-column tinyint table with three rows.
	-- parquetTbl is already created and populated earlier in this script, and it is
	-- built with a plain "create table" plus "insert into". Drop it first so the
	-- create does not fail on the existing table and the rows are not appended twice.
	drop table if exists parquetTbl;

	create table parquetTbl (t1 tinyint, t2 tinyint)
	stored as parquet;

	-- Arithmetically, only (54, 9) has t1 + t2 above 10 (63, versus -79 and -88).
	insert into parquetTbl values (54, 9), (-104, 25), (-112, 24);

	-- Plan for filtering and projecting on the tinyint sum.
	explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10;

	-- Execute the query.
	SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10;