ql/src/test/queries/clientpositive/correlationoptimizer8.q - hive - Git at Google

 --! qt:dataset:src1
 --! qt:dataset:src
 set hive.mapred.mode=nonstrict;
 set hive.auto.convert.join=false;
 set hive.optimize.correlation=false;

 -- SORT_QUERY_RESULTS

 -- When the Correlation Optimizer is turned off, this query will be evaluated by
 -- 4 MR jobs.
 -- When the Correlation Optimizer is turned on, because both inputs of the
 -- UnionOperator are correlated, we can use 2 MR jobs to evaluate this query.
 -- The first MR job will evaluate subquery subq1 and subq1 join x. The second
 -- MR is for ordering.
 EXPLAIN
 SELECT x.key, x.value, subq1.cnt
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key
 ) subq1
 JOIN src1 x ON (x.key = subq1.key);

 SELECT x.key, x.value, subq1.cnt
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key
 ) subq1
 JOIN src1 x ON (x.key = subq1.key);

 set hive.optimize.correlation=true;
 EXPLAIN
 SELECT x.key, x.value, subq1.cnt
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key
 ) subq1
 JOIN src1 x ON (x.key = subq1.key);

 SELECT x.key, x.value, subq1.cnt
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key
 ) subq1
 JOIN src1 x ON (x.key = subq1.key);

 set hive.optimize.correlation=false;
 -- When the Correlation Optimizer is turned off, this query will be evaluated by
 -- 4 MR jobs.
 -- When the Correlation Optimizer is turned on, because both inputs of the
 -- UnionOperator are correlated, we can use 2 MR jobs to evaluate this query.
 -- The first MR job will evaluate subquery subq1 and subq1 join x. The second
 -- MR is for ordering.
 EXPLAIN
 SELECT subq1.key, subq1.cnt, x.key, x.value
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.value as key, count(1) as cnt from src1 x1 where x1.key > 100 group by x1.value
 ) subq1
 LEFT OUTER JOIN src1 x ON (x.key = subq1.key);

 SELECT subq1.key, subq1.cnt, x.key, x.value
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.value as key, count(1) as cnt from src1 x1 where x1.key > 100 group by x1.value
 ) subq1
 LEFT OUTER JOIN src1 x ON (x.key = subq1.key);

 set hive.optimize.correlation=true;
 EXPLAIN
 SELECT subq1.key, subq1.cnt, x.key, x.value
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.value as key, count(1) as cnt from src1 x1 where x1.key > 100 group by x1.value
 ) subq1
 LEFT OUTER JOIN src1 x ON (x.key = subq1.key);

 SELECT subq1.key, subq1.cnt, x.key, x.value
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.value as key, count(1) as cnt from src1 x1 where x1.key > 100 group by x1.value
 ) subq1
 LEFT OUTER JOIN src1 x ON (x.key = subq1.key);

 set hive.optimize.correlation=true;
 -- When the Correlation Optimizer is turned on, because a input of UnionOperator is
 -- not correlated, we cannot handle this case right now. So, this query will not be
 -- optimized.
 EXPLAIN
 SELECT x.key, x.value, subq1.cnt
 FROM
 ( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key, x1.value
 ) subq1
 JOIN src1 x ON (x.key = subq1.key);

 set hive.optimize.correlation=true;
 -- When the Correlation Optimizer is turned on, because a input of UnionOperator is
 -- not correlated, we cannot handle this case right now. So, this query will not be
 -- optimized.
 EXPLAIN
 SELECT subq1.key, subq1.value, x.key, x.value
 FROM
 ( SELECT cast(x.key as INT) as key, count(1) as value from src x where x.key < 20 group by x.key
      UNION ALL
   SELECT count(1) as key, cast(x1.key as INT) as value from src x1 where x1.key > 100 group by x1.key
 ) subq1
 FULL OUTER JOIN src1 x ON (x.key = subq1.key);
	--! qt:dataset:src1
	--! qt:dataset:src
	set hive.mapred.mode=nonstrict;
	set hive.auto.convert.join=false;
	set hive.optimize.correlation=false;

	-- SORT_QUERY_RESULTS

	-- When the Correlation Optimizer is turned off, this query will be evaluated by
	-- 4 MR jobs.
	-- When the Correlation Optimizer is turned on, because both inputs of the
	-- UnionOperator are correlated, we can use 2 MR jobs to evaluate this query.
	-- The first MR job will evaluate subquery subq1 and subq1 join x. The second
	-- MR is for ordering.
	EXPLAIN
	SELECT x.key, x.value, subq1.cnt
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key
	) subq1
	JOIN src1 x ON (x.key = subq1.key);

	SELECT x.key, x.value, subq1.cnt
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key
	) subq1
	JOIN src1 x ON (x.key = subq1.key);

	set hive.optimize.correlation=true;
	EXPLAIN
	SELECT x.key, x.value, subq1.cnt
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key
	) subq1
	JOIN src1 x ON (x.key = subq1.key);

	SELECT x.key, x.value, subq1.cnt
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key
	) subq1
	JOIN src1 x ON (x.key = subq1.key);

	set hive.optimize.correlation=false;
	-- When the Correlation Optimizer is turned off, this query will be evaluated by
	-- 4 MR jobs.
	-- When the Correlation Optimizer is turned on, because both inputs of the
	-- UnionOperator are correlated, we can use 2 MR jobs to evaluate this query.
	-- The first MR job will evaluate subquery subq1 and subq1 join x. The second
	-- MR is for ordering.
	EXPLAIN
	SELECT subq1.key, subq1.cnt, x.key, x.value
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.value as key, count(1) as cnt from src1 x1 where x1.key > 100 group by x1.value
	) subq1
	LEFT OUTER JOIN src1 x ON (x.key = subq1.key);

	SELECT subq1.key, subq1.cnt, x.key, x.value
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.value as key, count(1) as cnt from src1 x1 where x1.key > 100 group by x1.value
	) subq1
	LEFT OUTER JOIN src1 x ON (x.key = subq1.key);

	set hive.optimize.correlation=true;
	EXPLAIN
	SELECT subq1.key, subq1.cnt, x.key, x.value
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.value as key, count(1) as cnt from src1 x1 where x1.key > 100 group by x1.value
	) subq1
	LEFT OUTER JOIN src1 x ON (x.key = subq1.key);

	SELECT subq1.key, subq1.cnt, x.key, x.value
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.value as key, count(1) as cnt from src1 x1 where x1.key > 100 group by x1.value
	) subq1
	LEFT OUTER JOIN src1 x ON (x.key = subq1.key);

	set hive.optimize.correlation=true;
	-- When the Correlation Optimizer is turned on, because a input of UnionOperator is
	-- not correlated, we cannot handle this case right now. So, this query will not be
	-- optimized.
	EXPLAIN
	SELECT x.key, x.value, subq1.cnt
	FROM
	( SELECT x.key as key, count(1) as cnt from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT x1.key as key, count(1) as cnt from src x1 where x1.key > 100 group by x1.key, x1.value
	) subq1
	JOIN src1 x ON (x.key = subq1.key);

	set hive.optimize.correlation=true;
	-- When the Correlation Optimizer is turned on, because a input of UnionOperator is
	-- not correlated, we cannot handle this case right now. So, this query will not be
	-- optimized.
	EXPLAIN
	SELECT subq1.key, subq1.value, x.key, x.value
	FROM
	( SELECT cast(x.key as INT) as key, count(1) as value from src x where x.key < 20 group by x.key
	UNION ALL
	SELECT count(1) as key, cast(x1.key as INT) as value from src x1 where x1.key > 100 group by x1.key
	) subq1
	FULL OUTER JOIN src1 x ON (x.key = subq1.key);