ql/src/test/queries/clientpositive/vectorized_dynamic_semijoin_reduction.q - hive - Git at Google

 --! qt:dataset:src
 set hive.compute.query.using.stats=false;
 set hive.mapred.mode=nonstrict;
 set hive.explain.user=false;
 set hive.optimize.ppd=true;
 set hive.ppd.remove.duplicatefilters=true;
 set hive.tez.dynamic.partition.pruning=true;
 set hive.tez.dynamic.semijoin.reduction=true;
 set hive.optimize.metadataonly=false;
 set hive.optimize.index.filter=true;
 set hive.tez.bigtable.minsize.semijoin.reduction=1;
 set hive.tez.min.bloom.filter.entries=1;

 set hive.vectorized.adaptor.usage.mode=none;
 set hive.vectorized.execution.enabled=true;

 -- Create Tables
 create table dsrv_big stored as orc as select key as key_str, key as key_str2, cast(key as int) as key_int, cast(key as int) as key_int2, value from src;
 create table dsrv_small stored as orc as select distinct key as key_str, key as key_str2, cast(key as int) as key_int, cast(key as int) as key_int2, value from src where key < 100;

 -- single key (int)
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int);
 select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int);

 -- single key (string)
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str);
 select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str);

 -- keys are different type
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str);
 select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_str);

 -- multiple tables
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a, dsrv_small b, dsrv_small c where a.key_int = b.key_int and a.key_int = c.key_int;
 select count(*) from dsrv_big a, dsrv_small b, dsrv_small c where a.key_int = b.key_int and a.key_int = c.key_int;

 -- multiple keys
 SELECT 'use MurmurHashStringColIntCol';
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str and a.key_int = b.key_int);
 select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str and a.key_int = b.key_int);

 SELECT 'use MurmurHashStringColIntCol, regardless of key ordering on join clause';
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int and a.key_str = b.key_str);
 select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int and a.key_str = b.key_str);

 SELECT 'use MurmurHashIntColIntCol';
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int and a.key_int2 = b.key_int2);
 select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int and a.key_int2 = b.key_int2);

 SELECT 'use MurmurHashStringColStringCol';
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str and a.key_str2 = b.key_str2);
 select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str and a.key_str2 = b.key_str2);

 -- small table result is empty
 EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int) where b.value in ('nonexistent1', 'nonexistent2');
 select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int) where b.value in ('nonexistent1', 'nonexistent2');

 -- bloomfilter expectedEntries should use ndv if available. Compare to first query
 analyze table dsrv_small compute statistics;
 analyze table dsrv_small compute statistics for columns;
 set hive.stats.fetch.column.stats=true;
 EXPLAIN select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int);

 drop table dsrv_big;
 drop table dsrv_small;
	--! qt:dataset:src
	set hive.compute.query.using.stats=false;
	set hive.mapred.mode=nonstrict;
	set hive.explain.user=false;
	set hive.optimize.ppd=true;
	set hive.ppd.remove.duplicatefilters=true;
	set hive.tez.dynamic.partition.pruning=true;
	set hive.tez.dynamic.semijoin.reduction=true;
	set hive.optimize.metadataonly=false;
	set hive.optimize.index.filter=true;
	set hive.tez.bigtable.minsize.semijoin.reduction=1;
	set hive.tez.min.bloom.filter.entries=1;

	set hive.vectorized.adaptor.usage.mode=none;
	set hive.vectorized.execution.enabled=true;

	-- Create Tables
	create table dsrv_big stored as orc as select key as key_str, key as key_str2, cast(key as int) as key_int, cast(key as int) as key_int2, value from src;
	create table dsrv_small stored as orc as select distinct key as key_str, key as key_str2, cast(key as int) as key_int, cast(key as int) as key_int2, value from src where key < 100;

	-- single key (int)
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int);
	select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int);

	-- single key (string)
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str);
	select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str);

	-- keys are different type
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str);
	select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_str);

	-- multiple tables
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a, dsrv_small b, dsrv_small c where a.key_int = b.key_int and a.key_int = c.key_int;
	select count(*) from dsrv_big a, dsrv_small b, dsrv_small c where a.key_int = b.key_int and a.key_int = c.key_int;

	-- multiple keys
	SELECT 'use MurmurHashStringColIntCol';
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str and a.key_int = b.key_int);
	select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str and a.key_int = b.key_int);

	SELECT 'use MurmurHashStringColIntCol, regardless of key ordering on join clause';
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int and a.key_str = b.key_str);
	select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int and a.key_str = b.key_str);

	SELECT 'use MurmurHashIntColIntCol';
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int and a.key_int2 = b.key_int2);
	select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int and a.key_int2 = b.key_int2);

	SELECT 'use MurmurHashStringColStringCol';
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str and a.key_str2 = b.key_str2);
	select count(*) from dsrv_big a join dsrv_small b on (a.key_str = b.key_str and a.key_str2 = b.key_str2);

	-- small table result is empty
	EXPLAIN VECTORIZATION EXPRESSION select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int) where b.value in ('nonexistent1', 'nonexistent2');
	select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int) where b.value in ('nonexistent1', 'nonexistent2');

	-- bloomfilter expectedEntries should use ndv if available. Compare to first query
	analyze table dsrv_small compute statistics;
	analyze table dsrv_small compute statistics for columns;
	set hive.stats.fetch.column.stats=true;
	EXPLAIN select count(*) from dsrv_big a join dsrv_small b on (a.key_int = b.key_int);

	drop table dsrv_big;
	drop table dsrv_small;