ql/src/test/queries/clientpositive/correlationoptimizer10.q - hive - Git at Google

 --! qt:dataset:src1
 --! qt:dataset:src
 set hive.mapred.mode=nonstrict;
 set hive.auto.convert.join=false;
 set hive.optimize.correlation=false;
 -- When Correlation Optimizer is turned off, 4 MR jobs are needed.
 -- When Correlation Optimizer is turned on, 2 MR jobs are needed.
 -- The first job will evaluate subquery xx and xx join yy.
 -- This case is used to test LEFT SEMI JOIN since Hive will
 -- introduce a GroupByOperator before the ReduceSinkOperator of
 -- the right table (yy in queries below)
 -- of LEFT SEMI JOIN.

 -- SORT_AND_HASH_QUERY_RESULTS

 EXPLAIN
 SELECT xx.key, xx.cnt
 FROM
 (SELECT x.key as key, count(1) as cnt FROM src1 x JOIN src1 y ON (x.key = y.key) group by x.key) xx
 LEFT SEMI JOIN src yy
 ON xx.key=yy.key;

 SELECT xx.key, xx.cnt
 FROM
 (SELECT x.key as key, count(1) as cnt FROM src1 x JOIN src1 y ON (x.key = y.key) group by x.key) xx
 LEFT SEMI JOIN src yy
 ON xx.key=yy.key;

 set hive.optimize.correlation=true;
 EXPLAIN
 SELECT xx.key, xx.cnt
 FROM
 (SELECT x.key as key, count(1) as cnt FROM src1 x JOIN src1 y ON (x.key = y.key) group by x.key) xx
 LEFT SEMI JOIN src yy
 ON xx.key=yy.key;

 SELECT xx.key, xx.cnt
 FROM
 (SELECT x.key as key, count(1) as cnt FROM src1 x JOIN src1 y ON (x.key = y.key) group by x.key) xx
 LEFT SEMI JOIN src yy
 ON xx.key=yy.key;

 set hive.optimize.correlation=false;
 -- When Correlation Optimizer is turned off, 4 MR jobs are needed.
 -- When Correlation Optimizer is turned on, 2 MR jobs are needed.
 -- The first job will evaluate subquery xx and xx join yy.
 -- This case is used to test LEFT SEMI JOIN since Hive will
 -- introduce a GroupByOperator before the ReduceSinkOperator of
 -- the right table (yy in queries below)
 -- of LEFT SEMI JOIN.
 EXPLAIN
 SELECT xx.key, xx.value
 FROM
 src1 xx
 LEFT SEMI JOIN
 (SELECT x.key as key
  FROM src x JOIN src y ON (x.key = y.key)
  WHERE x.key < 200 AND
        y.key > 20) yy
 ON xx.key=yy.key;

 SELECT xx.key, xx.value
 FROM
 src1 xx
 LEFT SEMI JOIN
 (SELECT x.key as key
  FROM src x JOIN src y ON (x.key = y.key)
  WHERE x.key < 200 AND
        y.key > 20) yy
 ON xx.key=yy.key;

 set hive.optimize.correlation=true;
 EXPLAIN
 SELECT xx.key, xx.value
 FROM
 src1 xx
 LEFT SEMI JOIN
 (SELECT x.key as key
  FROM src x JOIN src y ON (x.key = y.key)
  WHERE x.key < 200 AND
        y.key > 20) yy
 ON xx.key=yy.key;

 SELECT xx.key, xx.value
 FROM
 src1 xx
 LEFT SEMI JOIN
 (SELECT x.key as key
  FROM src x JOIN src y ON (x.key = y.key)
  WHERE x.key < 200 AND
        y.key > 20) yy
 ON xx.key=yy.key;

 set hive.optimize.correlation=false;
 -- When Correlation Optimizer is turned off, 4 MR jobs are needed.
 -- When Correlation Optimizer is turned on, 2 MR jobs are needed.
 -- This test is used to test if we can use shared scan for
 -- xx, yy:x, and yy:y.
 EXPLAIN
 SELECT xx.key, xx.value
 FROM
 src xx
 LEFT SEMI JOIN
 (SELECT x.key as key
  FROM src x JOIN src y ON (x.key = y.key)
  WHERE x.key < 200 AND x.key > 180) yy
 ON xx.key=yy.key;

 SELECT xx.key, xx.value
 FROM
 src xx
 LEFT SEMI JOIN
 (SELECT x.key as key
  FROM src x JOIN src y ON (x.key = y.key)
  WHERE x.key < 200 AND x.key > 180) yy
 ON xx.key=yy.key;

 set hive.optimize.correlation=true;
 EXPLAIN
 SELECT xx.key, xx.value
 FROM
 src xx
 LEFT SEMI JOIN
 (SELECT x.key as key
  FROM src x JOIN src y ON (x.key = y.key)
  WHERE x.key < 200 AND x.key > 180) yy
 ON xx.key=yy.key;

 SELECT xx.key, xx.value
 FROM
 src xx
 LEFT SEMI JOIN
 (SELECT x.key as key
  FROM src x JOIN src y ON (x.key = y.key)
  WHERE x.key < 200 AND x.key > 180) yy
 ON xx.key=yy.key;
	--! qt:dataset:src1
	--! qt:dataset:src
	set hive.mapred.mode=nonstrict;
	set hive.auto.convert.join=false;
	set hive.optimize.correlation=false;
	-- When Correlation Optimizer is turned off, 4 MR jobs are needed.
	-- When Correlation Optimizer is turned on, 2 MR jobs are needed.
	-- The first job will evaluate subquery xx and xx join yy.
	-- This case is used to test LEFT SEMI JOIN since Hive will
	-- introduce a GroupByOperator before the ReduceSinkOperator of
	-- the right table (yy in queries below)
	-- of LEFT SEMI JOIN.

	-- SORT_AND_HASH_QUERY_RESULTS

	EXPLAIN
	SELECT xx.key, xx.cnt
	FROM
	(SELECT x.key as key, count(1) as cnt FROM src1 x JOIN src1 y ON (x.key = y.key) group by x.key) xx
	LEFT SEMI JOIN src yy
	ON xx.key=yy.key;

	SELECT xx.key, xx.cnt
	FROM
	(SELECT x.key as key, count(1) as cnt FROM src1 x JOIN src1 y ON (x.key = y.key) group by x.key) xx
	LEFT SEMI JOIN src yy
	ON xx.key=yy.key;

	set hive.optimize.correlation=true;
	EXPLAIN
	SELECT xx.key, xx.cnt
	FROM
	(SELECT x.key as key, count(1) as cnt FROM src1 x JOIN src1 y ON (x.key = y.key) group by x.key) xx
	LEFT SEMI JOIN src yy
	ON xx.key=yy.key;

	SELECT xx.key, xx.cnt
	FROM
	(SELECT x.key as key, count(1) as cnt FROM src1 x JOIN src1 y ON (x.key = y.key) group by x.key) xx
	LEFT SEMI JOIN src yy
	ON xx.key=yy.key;

	set hive.optimize.correlation=false;
	-- When Correlation Optimizer is turned off, 4 MR jobs are needed.
	-- When Correlation Optimizer is turned on, 2 MR jobs are needed.
	-- The first job will evaluate subquery xx and xx join yy.
	-- This case is used to test LEFT SEMI JOIN since Hive will
	-- introduce a GroupByOperator before the ReduceSinkOperator of
	-- the right table (yy in queries below)
	-- of LEFT SEMI JOIN.
	EXPLAIN
	SELECT xx.key, xx.value
	FROM
	src1 xx
	LEFT SEMI JOIN
	(SELECT x.key as key
	FROM src x JOIN src y ON (x.key = y.key)
	WHERE x.key < 200 AND
	y.key > 20) yy
	ON xx.key=yy.key;

	SELECT xx.key, xx.value
	FROM
	src1 xx
	LEFT SEMI JOIN
	(SELECT x.key as key
	FROM src x JOIN src y ON (x.key = y.key)
	WHERE x.key < 200 AND
	y.key > 20) yy
	ON xx.key=yy.key;

	set hive.optimize.correlation=true;
	EXPLAIN
	SELECT xx.key, xx.value
	FROM
	src1 xx
	LEFT SEMI JOIN
	(SELECT x.key as key
	FROM src x JOIN src y ON (x.key = y.key)
	WHERE x.key < 200 AND
	y.key > 20) yy
	ON xx.key=yy.key;

	SELECT xx.key, xx.value
	FROM
	src1 xx
	LEFT SEMI JOIN
	(SELECT x.key as key
	FROM src x JOIN src y ON (x.key = y.key)
	WHERE x.key < 200 AND
	y.key > 20) yy
	ON xx.key=yy.key;

	set hive.optimize.correlation=false;
	-- When Correlation Optimizer is turned off, 4 MR jobs are needed.
	-- When Correlation Optimizer is turned on, 2 MR jobs are needed.
	-- This test is used to test if we can use shared scan for
	-- xx, yy:x, and yy:y.
	EXPLAIN
	SELECT xx.key, xx.value
	FROM
	src xx
	LEFT SEMI JOIN
	(SELECT x.key as key
	FROM src x JOIN src y ON (x.key = y.key)
	WHERE x.key < 200 AND x.key > 180) yy
	ON xx.key=yy.key;

	SELECT xx.key, xx.value
	FROM
	src xx
	LEFT SEMI JOIN
	(SELECT x.key as key
	FROM src x JOIN src y ON (x.key = y.key)
	WHERE x.key < 200 AND x.key > 180) yy
	ON xx.key=yy.key;

	set hive.optimize.correlation=true;
	EXPLAIN
	SELECT xx.key, xx.value
	FROM
	src xx
	LEFT SEMI JOIN
	(SELECT x.key as key
	FROM src x JOIN src y ON (x.key = y.key)
	WHERE x.key < 200 AND x.key > 180) yy
	ON xx.key=yy.key;

	SELECT xx.key, xx.value
	FROM
	src xx
	LEFT SEMI JOIN
	(SELECT x.key as key
	FROM src x JOIN src y ON (x.key = y.key)
	WHERE x.key < 200 AND x.key > 180) yy
	ON xx.key=yy.key;