ql/src/test/queries/clientpositive/orc_ppd_basic.q - hive - Git at Google

 --! qt:dataset:alltypesorc
 --! qt:replace:/(\s+HDFS_BYTES_WRITTEN:\s+)\S+(\s*)/$1#Masked#/
 --! qt:replace:/(\s+HDFS_BYTES_READ:\s+)\S+(\s*)/$1#Masked#/
 SET hive.vectorized.execution.enabled=false;
 set hive.compute.query.using.stats=false;
 set hive.mapred.mode=nonstrict;
 SET hive.fetch.task.conversion=none;
 SET hive.optimize.index.filter=true;
 SET hive.cbo.enable=false;
 SET hive.map.aggr=false;
 -- disabling map side aggregation as that can lead to different intermediate record counts

 CREATE TABLE staging_n7(t tinyint,
            si smallint,
            i int,
            b bigint,
            f float,
            d double,
            bo boolean,
            s string,
            ts timestamp,
            `dec` decimal(4,2),
            bin binary)
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
 STORED AS TEXTFILE;

 LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE staging_n7;
 LOAD DATA LOCAL INPATH '../../data/files/over1k' INTO TABLE staging_n7;

 CREATE TABLE orc_ppd_staging_n1(t tinyint,
            si smallint,
            i int,
            b bigint,
            f float,
            d double,
            bo boolean,
            s string,
            c char(50),
            v varchar(50),
            da date,
            `dec` decimal(4,2),
            bin binary)
 STORED AS ORC tblproperties("orc.row.index.stride" = "1000", "orc.bloom.filter.columns"="*");

 insert overwrite table orc_ppd_staging_n1 select t, si, i, b, f, d, bo, s, cast(s as char(50)) as c,
 cast(s as varchar(50)) as v, cast(ts as date) as da, `dec`, bin from staging_n7 order by t, si, i, b, f, d, bo, s, c, v, da, `dec`, bin;

 -- just to introduce a gap in min/max range for bloom filters. The dataset has contiguous values
 -- which makes it hard to test bloom filters
 insert into orc_ppd_staging_n1 select -10,-321,-65680,-4294967430,-97.94,-13.07,true,"aaa","aaa","aaa","1990-03-11",-71.54,"aaa" from staging_n7 limit 1;
 insert into orc_ppd_staging_n1 select 127,331,65690,4294967440,107.94,23.07,true,"zzz","zzz","zzz","2023-03-11",71.54,"zzz" from staging_n7 limit 1;

 CREATE TABLE orc_ppd_n2(t tinyint,
            si smallint,
            i int,
            b bigint,
            f float,
            d double,
            bo boolean,
            s string,
            c char(50),
            v varchar(50),
            da date,
            `dec` decimal(4,2),
            bin binary)
 STORED AS ORC tblproperties("orc.row.index.stride" = "1000", "orc.bloom.filter.columns"="*");

 insert overwrite table orc_ppd_n2 select t, si, i, b, f, d, bo, s, cast(s as char(50)) as c,
 cast(s as varchar(50)) as v, da, `dec`, bin from orc_ppd_staging_n1 order by t, si, i, b, f, d, bo, s, c, v, da, `dec`, bin;


 SET hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecTezSummaryPrinter;

 -- Row group statistics for column t:
 -- Entry 0: count: 994 hasNull: true min: -10 max: 54 sum: 26014 positions: 0,0,0,0,0,0,0
 -- Entry 1: count: 1000 hasNull: false min: 54 max: 118 sum: 86812 positions: 0,2,124,0,0,116,11
 -- Entry 2: count: 100 hasNull: false min: 118 max: 127 sum: 12151 positions: 0,4,119,0,0,244,19

 -- INPUT_RECORDS: 2100 (all row groups)
 select count(*) from orc_ppd_n2;

 -- INPUT_RECORDS: 0 (no row groups)
 select count(*) from orc_ppd_n2 where t > 127;

 -- INPUT_RECORDS: 1000 (1 row group)
 select count(*) from orc_ppd_n2 where t = 55;
 select count(*) from orc_ppd_n2 where t <=> 50;
 select count(*) from orc_ppd_n2 where t <=> 100;

 -- INPUT_RECORDS: 2000 (2 row groups)
 select count(*) from orc_ppd_n2 where t = "54";

 -- INPUT_RECORDS: 1000 (1 row group)
 select count(*) from orc_ppd_n2 where t = -10.0;

 -- INPUT_RECORDS: 1000 (1 row group)
 select count(*) from orc_ppd_n2 where t = cast(53 as float);
 select count(*) from orc_ppd_n2 where t = cast(53 as double);

 -- INPUT_RECORDS: 2000 (2 row groups)
 select count(*) from orc_ppd_n2 where t < 100;

 -- INPUT_RECORDS: 1000 (1 row group)
 select count(*) from orc_ppd_n2 where t < 100 and t > 98;

 -- INPUT_RECORDS: 2000 (2 row groups)
 select count(*) from orc_ppd_n2 where t <= 100;

 -- INPUT_RECORDS: 1000 (1 row group)
 select count(*) from orc_ppd_n2 where t is null;

 -- INPUT_RECORDS: 1100 (2 row groups)
 select count(*) from orc_ppd_n2 where t in (5, 120);

 -- INPUT_RECORDS: 1000 (1 row group)
 select count(*) from orc_ppd_n2 where t between 60 and 80;

 -- bloom filter tests
 -- INPUT_RECORDS: 0
 select count(*) from orc_ppd_n2 where t = -100;
 select count(*) from orc_ppd_n2 where t <=> -100;
 select count(*) from orc_ppd_n2 where t = 125;
 select count(*) from orc_ppd_n2 where t IN (-100, 125, 200);

 -- Row group statistics for column s:
 -- Entry 0: count: 1000 hasNull: false min:  max: zach young sum: 12907 positions: 0,0,0
 -- Entry 1: count: 1000 hasNull: false min: alice allen max: zach zipper sum: 12704 positions: 0,1611,191
 -- Entry 2: count: 100 hasNull: false min: bob davidson max: zzz sum: 1281 positions: 0,3246,373

 -- INPUT_RECORDS: 0 (no row groups)
 select count(*) from orc_ppd_n2 where s > "zzz";

 -- INPUT_RECORDS: 1000 (1 row group)
 select count(*) from orc_ppd_n2 where s = "zach young";
 select count(*) from orc_ppd_n2 where s <=> "zach zipper";
 select count(*) from orc_ppd_n2 where s <=> "";

 -- INPUT_RECORDS: 0
 select count(*) from orc_ppd_n2 where s is null;

 -- INPUT_RECORDS: 2100
 select count(*) from orc_ppd_n2 where s is not null;

 -- INPUT_RECORDS: 0
 select count(*) from orc_ppd_n2 where s = cast("zach young" as char(50));

 -- INPUT_RECORDS: 1000 (1 row group)
 select count(*) from orc_ppd_n2 where s = cast("zach young" as char(10));
 select count(*) from orc_ppd_n2 where s = cast("zach young" as varchar(10));
 select count(*) from orc_ppd_n2 where s = cast("zach young" as varchar(50));

 -- INPUT_RECORDS: 2000 (2 row groups)
 select count(*) from orc_ppd_n2 where s < "b";

 -- INPUT_RECORDS: 2000 (2 row groups)
 select count(*) from orc_ppd_n2 where s > "alice" and s < "bob";

 -- INPUT_RECORDS: 2000 (2 row groups)
 select count(*) from orc_ppd_n2 where s in ("alice allen", "");

 -- INPUT_RECORDS: 2000 (2 row groups)
 select count(*) from orc_ppd_n2 where s between "" and "alice allen";

 -- INPUT_RECORDS: 100 (1 row group)
 select count(*) from orc_ppd_n2 where s between "zz" and "zzz";

 -- INPUT_RECORDS: 1100 (2 row groups)
 select count(*) from orc_ppd_n2 where s between "zach zipper" and "zzz";

 -- bloom filter tests
 -- INPUT_RECORDS: 0
 select count(*) from orc_ppd_n2 where s = "hello world";
 select count(*) from orc_ppd_n2 where s <=> "apache hive";
 select count(*) from orc_ppd_n2 where s IN ("a", "z");

 -- INPUT_RECORDS: 100
 select count(*) from orc_ppd_n2 where s = "sarah ovid";

 -- INPUT_RECORDS: 1100
 select count(*) from orc_ppd_n2 where s = "wendy king";

 -- INPUT_RECORDS: 1000
 select count(*) from orc_ppd_n2 where s = "wendy king" and t < 0;

 -- INPUT_RECORDS: 100
 select count(*) from orc_ppd_n2 where s = "wendy king" and t > 100;

 set hive.cbo.enable=false;
 set hive.optimize.index.filter=false;
 -- when cbo is disabled constant gets converted to HiveDecimal
 --  74.72f + 0.0 = 74.72000122070312
 select count(*) from orc_ppd_n2 where f=74.72;
 set hive.optimize.index.filter=true;
 select count(*) from orc_ppd_n2 where f=74.72;

 set hive.cbo.enable=true;
 set hive.optimize.index.filter=false;
 select count(*) from orc_ppd_n2 where f=74.72;
 set hive.optimize.index.filter=true;
 select count(*) from orc_ppd_n2 where f=74.72;

 -- 42.47f + 0.0 == 42.470001220703125
 create temporary table orc_ppd_1 stored as orc as select * from orc_ppd_staging_n1 where d = 42.47;

 set hive.cbo.enable=false;
 set hive.optimize.index.filter=false;
 -- when cbo is disabled constant gets converted to HiveDecimal
 select count(*) from orc_ppd_1 where d=42.47;
 set hive.optimize.index.filter=true;
 select count(*) from orc_ppd_1 where d=42.47;

 set hive.cbo.enable=true;
 set hive.optimize.index.filter=false;
 select count(*) from orc_ppd_1 where d=42.47;
 set hive.optimize.index.filter=true;
 select count(*) from orc_ppd_1 where d=42.47;

 RESET;
 set hive.compute.query.using.stats=false;
 set hive.mapred.mode=nonstrict;
 SET hive.fetch.task.conversion=none;
 SET hive.optimize.index.filter=true;
 SET hive.cbo.enable=false;
 SET hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecOrcRowGroupCountPrinter;
 -- these tests include timestamp column that will impact the file size when tests run across
 -- different timezones. So we print only the selected row group count instead of entire tez exeuction summary.
 create temporary table tmp_orcppd
                     stored as orc
                     as select ctinyint, csmallint, cint , cbigint, cfloat, cdouble,
                               cstring1, cstring2, ctimestamp1, ctimestamp2
                     from alltypesorc limit 20;
 insert into table tmp_orcppd
                     values(null, null, null, null, null,
                            null, null, null, null, null);

 drop table if exists tbl_orcppd_1_1;

 create table tbl_orcppd_1_1 as
     select count(*) from tmp_orcppd
             where ctimestamp1> current_timestamp() and
             ctimestamp2 > current_timestamp() and
             cstring1 like 'a*' and
             cstring2 like 'a*';

 drop table if exists tmp_orcppd;

 create temporary table tmp_orcppd
                     stored as orc
                     as select ctimestamp1, ctimestamp2
                     from alltypesorc limit 10;
 insert into table tmp_orcppd
                     values(null,  null);

 drop table if exists tbl_orcppd_2_1;
 create table tbl_orcppd_2_1 as
         select count(*) from tmp_orcppd
                     where ctimestamp1 in (cast('2065-08-13 19:03:52' as timestamp), cast('2071-01-16 20:21:17' as timestamp), current_timestamp());
 set hive.optimize.index.filter=true;

 drop table if exists tmp_orcppd;
 create temporary table tmp_orcppd
                     stored as orc
                     as select ts, cast(ts as date)
                     from staging_n7 ;
 insert into table tmp_orcppd
                     values(null, null);

 drop table if exists tbl_orcppd_3_1;
 create table tbl_orcppd_3_1 as
    select count(*) from tmp_orcppd
             group by ts, cast(ts as date)
             having ts in (select ctimestamp1 from alltypesorc limit 10);
	--! qt:dataset:alltypesorc
	--! qt:replace:/(\s+HDFS_BYTES_WRITTEN:\s+)\S+(\s*)/$1#Masked#/
	--! qt:replace:/(\s+HDFS_BYTES_READ:\s+)\S+(\s*)/$1#Masked#/
	SET hive.vectorized.execution.enabled=false;
	set hive.compute.query.using.stats=false;
	set hive.mapred.mode=nonstrict;
	SET hive.fetch.task.conversion=none;
	SET hive.optimize.index.filter=true;
	SET hive.cbo.enable=false;
	SET hive.map.aggr=false;
	-- disabling map side aggregation as that can lead to different intermediate record counts

	CREATE TABLE staging_n7(t tinyint,
	si smallint,
	i int,
	b bigint,
	f float,
	d double,
	bo boolean,
	s string,
	ts timestamp,
	`dec` decimal(4,2),
	bin binary)
	ROW FORMAT DELIMITED FIELDS TERMINATED BY '\|'
	STORED AS TEXTFILE;

	LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE staging_n7;
	LOAD DATA LOCAL INPATH '../../data/files/over1k' INTO TABLE staging_n7;

	CREATE TABLE orc_ppd_staging_n1(t tinyint,
	si smallint,
	i int,
	b bigint,
	f float,
	d double,
	bo boolean,
	s string,
	c char(50),
	v varchar(50),
	da date,
	`dec` decimal(4,2),
	bin binary)
	STORED AS ORC tblproperties("orc.row.index.stride" = "1000", "orc.bloom.filter.columns"="*");

	insert overwrite table orc_ppd_staging_n1 select t, si, i, b, f, d, bo, s, cast(s as char(50)) as c,
	cast(s as varchar(50)) as v, cast(ts as date) as da, `dec`, bin from staging_n7 order by t, si, i, b, f, d, bo, s, c, v, da, `dec`, bin;

	-- just to introduce a gap in min/max range for bloom filters. The dataset has contiguous values
	-- which makes it hard to test bloom filters
	insert into orc_ppd_staging_n1 select -10,-321,-65680,-4294967430,-97.94,-13.07,true,"aaa","aaa","aaa","1990-03-11",-71.54,"aaa" from staging_n7 limit 1;
	insert into orc_ppd_staging_n1 select 127,331,65690,4294967440,107.94,23.07,true,"zzz","zzz","zzz","2023-03-11",71.54,"zzz" from staging_n7 limit 1;

	CREATE TABLE orc_ppd_n2(t tinyint,
	si smallint,
	i int,
	b bigint,
	f float,
	d double,
	bo boolean,
	s string,
	c char(50),
	v varchar(50),
	da date,
	`dec` decimal(4,2),
	bin binary)
	STORED AS ORC tblproperties("orc.row.index.stride" = "1000", "orc.bloom.filter.columns"="*");

	insert overwrite table orc_ppd_n2 select t, si, i, b, f, d, bo, s, cast(s as char(50)) as c,
	cast(s as varchar(50)) as v, da, `dec`, bin from orc_ppd_staging_n1 order by t, si, i, b, f, d, bo, s, c, v, da, `dec`, bin;


	SET hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecTezSummaryPrinter;

	-- Row group statistics for column t:
	-- Entry 0: count: 994 hasNull: true min: -10 max: 54 sum: 26014 positions: 0,0,0,0,0,0,0
	-- Entry 1: count: 1000 hasNull: false min: 54 max: 118 sum: 86812 positions: 0,2,124,0,0,116,11
	-- Entry 2: count: 100 hasNull: false min: 118 max: 127 sum: 12151 positions: 0,4,119,0,0,244,19

	-- INPUT_RECORDS: 2100 (all row groups)
	select count(*) from orc_ppd_n2;

	-- INPUT_RECORDS: 0 (no row groups)
	select count(*) from orc_ppd_n2 where t > 127;

	-- INPUT_RECORDS: 1000 (1 row group)
	select count(*) from orc_ppd_n2 where t = 55;
	select count(*) from orc_ppd_n2 where t <=> 50;
	select count(*) from orc_ppd_n2 where t <=> 100;

	-- INPUT_RECORDS: 2000 (2 row groups)
	select count(*) from orc_ppd_n2 where t = "54";

	-- INPUT_RECORDS: 1000 (1 row group)
	select count(*) from orc_ppd_n2 where t = -10.0;

	-- INPUT_RECORDS: 1000 (1 row group)
	select count(*) from orc_ppd_n2 where t = cast(53 as float);
	select count(*) from orc_ppd_n2 where t = cast(53 as double);

	-- INPUT_RECORDS: 2000 (2 row groups)
	select count(*) from orc_ppd_n2 where t < 100;

	-- INPUT_RECORDS: 1000 (1 row group)
	select count(*) from orc_ppd_n2 where t < 100 and t > 98;

	-- INPUT_RECORDS: 2000 (2 row groups)
	select count(*) from orc_ppd_n2 where t <= 100;

	-- INPUT_RECORDS: 1000 (1 row group)
	select count(*) from orc_ppd_n2 where t is null;

	-- INPUT_RECORDS: 1100 (2 row groups)
	select count(*) from orc_ppd_n2 where t in (5, 120);

	-- INPUT_RECORDS: 1000 (1 row group)
	select count(*) from orc_ppd_n2 where t between 60 and 80;

	-- bloom filter tests
	-- INPUT_RECORDS: 0
	select count(*) from orc_ppd_n2 where t = -100;
	select count(*) from orc_ppd_n2 where t <=> -100;
	select count(*) from orc_ppd_n2 where t = 125;
	select count(*) from orc_ppd_n2 where t IN (-100, 125, 200);

	-- Row group statistics for column s:
	-- Entry 0: count: 1000 hasNull: false min: max: zach young sum: 12907 positions: 0,0,0
	-- Entry 1: count: 1000 hasNull: false min: alice allen max: zach zipper sum: 12704 positions: 0,1611,191
	-- Entry 2: count: 100 hasNull: false min: bob davidson max: zzz sum: 1281 positions: 0,3246,373

	-- INPUT_RECORDS: 0 (no row groups)
	select count(*) from orc_ppd_n2 where s > "zzz";

	-- INPUT_RECORDS: 1000 (1 row group)
	select count(*) from orc_ppd_n2 where s = "zach young";
	select count(*) from orc_ppd_n2 where s <=> "zach zipper";
	select count(*) from orc_ppd_n2 where s <=> "";

	-- INPUT_RECORDS: 0
	select count(*) from orc_ppd_n2 where s is null;

	-- INPUT_RECORDS: 2100
	select count(*) from orc_ppd_n2 where s is not null;

	-- INPUT_RECORDS: 0
	select count(*) from orc_ppd_n2 where s = cast("zach young" as char(50));

	-- INPUT_RECORDS: 1000 (1 row group)
	select count(*) from orc_ppd_n2 where s = cast("zach young" as char(10));
	select count(*) from orc_ppd_n2 where s = cast("zach young" as varchar(10));
	select count(*) from orc_ppd_n2 where s = cast("zach young" as varchar(50));

	-- INPUT_RECORDS: 2000 (2 row groups)
	select count(*) from orc_ppd_n2 where s < "b";

	-- INPUT_RECORDS: 2000 (2 row groups)
	select count(*) from orc_ppd_n2 where s > "alice" and s < "bob";

	-- INPUT_RECORDS: 2000 (2 row groups)
	select count(*) from orc_ppd_n2 where s in ("alice allen", "");

	-- INPUT_RECORDS: 2000 (2 row groups)
	select count(*) from orc_ppd_n2 where s between "" and "alice allen";

	-- INPUT_RECORDS: 100 (1 row group)
	select count(*) from orc_ppd_n2 where s between "zz" and "zzz";

	-- INPUT_RECORDS: 1100 (2 row groups)
	select count(*) from orc_ppd_n2 where s between "zach zipper" and "zzz";

	-- bloom filter tests
	-- INPUT_RECORDS: 0
	select count(*) from orc_ppd_n2 where s = "hello world";
	select count(*) from orc_ppd_n2 where s <=> "apache hive";
	select count(*) from orc_ppd_n2 where s IN ("a", "z");

	-- INPUT_RECORDS: 100
	select count(*) from orc_ppd_n2 where s = "sarah ovid";

	-- INPUT_RECORDS: 1100
	select count(*) from orc_ppd_n2 where s = "wendy king";

	-- INPUT_RECORDS: 1000
	select count(*) from orc_ppd_n2 where s = "wendy king" and t < 0;

	-- INPUT_RECORDS: 100
	select count(*) from orc_ppd_n2 where s = "wendy king" and t > 100;

	set hive.cbo.enable=false;
	set hive.optimize.index.filter=false;
	-- when cbo is disabled constant gets converted to HiveDecimal
	-- 74.72f + 0.0 = 74.72000122070312
	select count(*) from orc_ppd_n2 where f=74.72;
	set hive.optimize.index.filter=true;
	select count(*) from orc_ppd_n2 where f=74.72;

	set hive.cbo.enable=true;
	set hive.optimize.index.filter=false;
	select count(*) from orc_ppd_n2 where f=74.72;
	set hive.optimize.index.filter=true;
	select count(*) from orc_ppd_n2 where f=74.72;

	-- 42.47f + 0.0 == 42.470001220703125
	create temporary table orc_ppd_1 stored as orc as select * from orc_ppd_staging_n1 where d = 42.47;

	set hive.cbo.enable=false;
	set hive.optimize.index.filter=false;
	-- when cbo is disabled constant gets converted to HiveDecimal
	select count(*) from orc_ppd_1 where d=42.47;
	set hive.optimize.index.filter=true;
	select count(*) from orc_ppd_1 where d=42.47;

	set hive.cbo.enable=true;
	set hive.optimize.index.filter=false;
	select count(*) from orc_ppd_1 where d=42.47;
	set hive.optimize.index.filter=true;
	select count(*) from orc_ppd_1 where d=42.47;

	RESET;
	set hive.compute.query.using.stats=false;
	set hive.mapred.mode=nonstrict;
	SET hive.fetch.task.conversion=none;
	SET hive.optimize.index.filter=true;
	SET hive.cbo.enable=false;
	SET hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecOrcRowGroupCountPrinter;
	-- these tests include timestamp column that will impact the file size when tests run across
	-- different timezones. So we print only the selected row group count instead of entire tez exeuction summary.
	create temporary table tmp_orcppd
	stored as orc
	as select ctinyint, csmallint, cint , cbigint, cfloat, cdouble,
	cstring1, cstring2, ctimestamp1, ctimestamp2
	from alltypesorc limit 20;
	insert into table tmp_orcppd
	values(null, null, null, null, null,
	null, null, null, null, null);

	drop table if exists tbl_orcppd_1_1;

	create table tbl_orcppd_1_1 as
	select count(*) from tmp_orcppd
	where ctimestamp1> current_timestamp() and
	ctimestamp2 > current_timestamp() and
	cstring1 like 'a*' and
	cstring2 like 'a*';

	drop table if exists tmp_orcppd;

	create temporary table tmp_orcppd
	stored as orc
	as select ctimestamp1, ctimestamp2
	from alltypesorc limit 10;
	insert into table tmp_orcppd
	values(null, null);

	drop table if exists tbl_orcppd_2_1;
	create table tbl_orcppd_2_1 as
	select count(*) from tmp_orcppd
	where ctimestamp1 in (cast('2065-08-13 19:03:52' as timestamp), cast('2071-01-16 20:21:17' as timestamp), current_timestamp());
	set hive.optimize.index.filter=true;

	drop table if exists tmp_orcppd;
	create temporary table tmp_orcppd
	stored as orc
	as select ts, cast(ts as date)
	from staging_n7 ;
	insert into table tmp_orcppd
	values(null, null);

	drop table if exists tbl_orcppd_3_1;
	create table tbl_orcppd_3_1 as
	select count(*) from tmp_orcppd
	group by ts, cast(ts as date)
	having ts in (select ctimestamp1 from alltypesorc limit 10);