ql/src/test/queries/clientpositive/smb_mapjoin_15.q - hive - Git at Google

 --! qt:dataset:src

 -- MASK_STATS

 set hive.optimize.bucketmapjoin = true;
 set hive.optimize.bucketmapjoin.sortedmerge = true;
 set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;


 set hive.exec.reducers.max = 1;
 set hive.merge.mapfiles=false;
 set hive.merge.mapredfiles=false;

 -- This test verifies that the sort merge join optimizer works when the tables are sorted on columns which is a superset
 -- of join columns

 -- Create bucketed and sorted tables
 CREATE TABLE test_table1_n4 (key INT, value STRING) CLUSTERED BY (key) SORTED BY (key ASC, value ASC) INTO 16 BUCKETS;
 CREATE TABLE test_table2_n4 (key INT, value STRING) CLUSTERED BY (key) SORTED BY (key ASC, value ASC) INTO 16 BUCKETS;

 FROM src
 INSERT OVERWRITE TABLE test_table1_n4 SELECT *
 INSERT OVERWRITE TABLE test_table2_n4 SELECT *;

 -- it should be converted to a sort-merge join, since the first sort column (#join columns = 1) contains the join columns
 EXPLAIN EXTENDED
 SELECT /*+mapjoin(b)*/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key ORDER BY a.key LIMIT 10;
 SELECT /*+mapjoin(b)*/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key ORDER BY a.key LIMIT 10;

 DROP TABLE test_table1_n4;
 DROP TABLE test_table2_n4;

 -- Create bucketed and sorted tables
 CREATE TABLE test_table1_n4 (key INT, key2 INT, value STRING) CLUSTERED BY (key) SORTED BY (key ASC, key2 ASC, value ASC) INTO 16 BUCKETS;
 CREATE TABLE test_table2_n4 (key INT, key2 INT, value STRING) CLUSTERED BY (key) SORTED BY (key ASC, key2 ASC, value ASC) INTO 16 BUCKETS;

 FROM src
 INSERT OVERWRITE TABLE test_table1_n4 SELECT key, key, value
 INSERT OVERWRITE TABLE test_table2_n4 SELECT key, key, value;

 -- it should be converted to a sort-merge join, since the first 2 sort columns (#join columns = 2) contain the join columns
 EXPLAIN EXTENDED
 SELECT /*+mapjoin(b)*/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key and a.key2 = b.key2 ORDER BY a.key LIMIT 10;
 SELECT /*+mapjoin(b)*/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key and a.key2 = b.key2 ORDER BY a.key LIMIT 10;

 -- it should be converted to a sort-merge join, since the first 2 sort columns (#join columns = 2) contain the join columns
 -- even if the order is not the same
 EXPLAIN EXTENDED
 SELECT /*+mapjoin(b)*/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key2 = b.key2 and a.key = b.key ORDER BY a.key LIMIT 10;
 SELECT /*+mapjoin(b)*/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key2 = b.key2 and a.key = b.key ORDER BY a.key LIMIT 10;

 -- it should not be converted to a sort-merge join, since the first 2 sort columns (#join columns = 2) do not contain all
 -- the join columns
 EXPLAIN EXTENDED
 SELECT /*+mapjoin(b)*/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key and a.value = b.value ORDER BY a.key LIMIT 10;
 SELECT /*+mapjoin(b)*/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key and a.value = b.value ORDER BY a.key LIMIT 10;

 DROP TABLE test_table1_n4;
 DROP TABLE test_table2_n4;
	--! qt:dataset:src

	-- MASK_STATS

	set hive.optimize.bucketmapjoin = true;
	set hive.optimize.bucketmapjoin.sortedmerge = true;
	set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;


	set hive.exec.reducers.max = 1;
	set hive.merge.mapfiles=false;
	set hive.merge.mapredfiles=false;

	-- This test verifies that the sort merge join optimizer works when the tables are sorted on columns which is a superset
	-- of join columns

	-- Create bucketed and sorted tables
	CREATE TABLE test_table1_n4 (key INT, value STRING) CLUSTERED BY (key) SORTED BY (key ASC, value ASC) INTO 16 BUCKETS;
	CREATE TABLE test_table2_n4 (key INT, value STRING) CLUSTERED BY (key) SORTED BY (key ASC, value ASC) INTO 16 BUCKETS;

	FROM src
	INSERT OVERWRITE TABLE test_table1_n4 SELECT *
	INSERT OVERWRITE TABLE test_table2_n4 SELECT *;

	-- it should be converted to a sort-merge join, since the first sort column (#join columns = 1) contains the join columns
	EXPLAIN EXTENDED
	SELECT /+mapjoin(b)/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key ORDER BY a.key LIMIT 10;
	SELECT /+mapjoin(b)/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key ORDER BY a.key LIMIT 10;

	DROP TABLE test_table1_n4;
	DROP TABLE test_table2_n4;

	-- Create bucketed and sorted tables
	CREATE TABLE test_table1_n4 (key INT, key2 INT, value STRING) CLUSTERED BY (key) SORTED BY (key ASC, key2 ASC, value ASC) INTO 16 BUCKETS;
	CREATE TABLE test_table2_n4 (key INT, key2 INT, value STRING) CLUSTERED BY (key) SORTED BY (key ASC, key2 ASC, value ASC) INTO 16 BUCKETS;

	FROM src
	INSERT OVERWRITE TABLE test_table1_n4 SELECT key, key, value
	INSERT OVERWRITE TABLE test_table2_n4 SELECT key, key, value;

	-- it should be converted to a sort-merge join, since the first 2 sort columns (#join columns = 2) contain the join columns
	EXPLAIN EXTENDED
	SELECT /+mapjoin(b)/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key and a.key2 = b.key2 ORDER BY a.key LIMIT 10;
	SELECT /+mapjoin(b)/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key and a.key2 = b.key2 ORDER BY a.key LIMIT 10;

	-- it should be converted to a sort-merge join, since the first 2 sort columns (#join columns = 2) contain the join columns
	-- even if the order is not the same
	EXPLAIN EXTENDED
	SELECT /+mapjoin(b)/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key2 = b.key2 and a.key = b.key ORDER BY a.key LIMIT 10;
	SELECT /+mapjoin(b)/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key2 = b.key2 and a.key = b.key ORDER BY a.key LIMIT 10;

	-- it should not be converted to a sort-merge join, since the first 2 sort columns (#join columns = 2) do not contain all
	-- the join columns
	EXPLAIN EXTENDED
	SELECT /+mapjoin(b)/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key and a.value = b.value ORDER BY a.key LIMIT 10;
	SELECT /+mapjoin(b)/ * FROM test_table1_n4 a JOIN test_table2_n4 b ON a.key = b.key and a.value = b.value ORDER BY a.key LIMIT 10;

	DROP TABLE test_table1_n4;
	DROP TABLE test_table2_n4;