ql/src/test/queries/clientpositive/correlationoptimizer14.q - hive - Git at Google

 --! qt:dataset:src1
 --! qt:dataset:src
 set hive.mapred.mode=nonstrict;
 set hive.optimize.reducededuplication=true;
 set hive.optimize.reducededuplication.min.reducer=1;
 set hive.optimize.correlation=true;
 set hive.remove.orderby.in.subquery=false;
 -- This file is used to show plans of queries involving cluster by, distribute by,
 -- order by, and sort by.
 -- Right now, Correlation optimizer check the most restrictive condition
 -- when determining if a ReduceSinkOperator is not necessary.
 -- This condition is that two ReduceSinkOperators should have same sorting columns,
 -- same partitioning columns, same sorting orders and no conflict on the numbers of reducers.

 -- Distribute by will not be optimized because distribute by does not introduce
 -- sorting columns.
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key) yy
 ON (xx.key=yy.key);

 -- Sort by will not be optimized because sort by does not introduce partitioning columns
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x SORT BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y SORT BY key) yy
 ON (xx.key=yy.key);

 set hive.optimize.correlation=false;
 -- Distribute by and sort by on the same key(s) should be optimized
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
 ON (xx.key=yy.key);

 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
 ON (xx.key=yy.key);

 set hive.optimize.correlation=true;
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
 ON (xx.key=yy.key);

 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
 ON (xx.key=yy.key);

 set hive.optimize.correlation=true;
 -- Because for join we use ascending order, if sort by uses descending order,
 -- this query will not be optimized
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key DESC) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key DESC) yy
 ON (xx.key=yy.key);

 -- Even if hive.optimize.reducededuplication.min.reducer=1, order by will not be optimized
 -- because order by does not introduce partitioning columns
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x ORDER BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y ORDER BY key) yy
 ON (xx.key=yy.key);

 set hive.optimize.correlation=false;
 -- Cluster by will be optimized
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
 ON (xx.key=yy.key);

 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
 ON (xx.key=yy.key);

 set hive.optimize.correlation=true;
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
 ON (xx.key=yy.key);

 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
 JOIN
 (SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
 ON (xx.key=yy.key);

 set hive.optimize.correlation=false;
 -- If hive.optimize.reducededuplication.min.reducer=1,
 -- group by and then order by should be optimized
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
 JOIN
 (SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
 ON (xx.key=yy.key);

 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
 JOIN
 (SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
 ON (xx.key=yy.key);

 set hive.optimize.correlation=true;
 EXPLAIN
 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
 JOIN
 (SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
 ON (xx.key=yy.key);

 SELECT xx.key, xx.value, yy.key, yy.value
 FROM
 (SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
 JOIN
 (SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
 ON (xx.key=yy.key);
	--! qt:dataset:src1
	--! qt:dataset:src
	set hive.mapred.mode=nonstrict;
	set hive.optimize.reducededuplication=true;
	set hive.optimize.reducededuplication.min.reducer=1;
	set hive.optimize.correlation=true;
	set hive.remove.orderby.in.subquery=false;
	-- This file is used to show plans of queries involving cluster by, distribute by,
	-- order by, and sort by.
	-- Right now, Correlation optimizer check the most restrictive condition
	-- when determining if a ReduceSinkOperator is not necessary.
	-- This condition is that two ReduceSinkOperators should have same sorting columns,
	-- same partitioning columns, same sorting orders and no conflict on the numbers of reducers.

	-- Distribute by will not be optimized because distribute by does not introduce
	-- sorting columns.
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key) yy
	ON (xx.key=yy.key);

	-- Sort by will not be optimized because sort by does not introduce partitioning columns
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x SORT BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y SORT BY key) yy
	ON (xx.key=yy.key);

	set hive.optimize.correlation=false;
	-- Distribute by and sort by on the same key(s) should be optimized
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
	ON (xx.key=yy.key);

	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
	ON (xx.key=yy.key);

	set hive.optimize.correlation=true;
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
	ON (xx.key=yy.key);

	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key) yy
	ON (xx.key=yy.key);

	set hive.optimize.correlation=true;
	-- Because for join we use ascending order, if sort by uses descending order,
	-- this query will not be optimized
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x DISTRIBUTE BY key SORT BY key DESC) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y DISTRIBUTE BY key SORT BY key DESC) yy
	ON (xx.key=yy.key);

	-- Even if hive.optimize.reducededuplication.min.reducer=1, order by will not be optimized
	-- because order by does not introduce partitioning columns
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x ORDER BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y ORDER BY key) yy
	ON (xx.key=yy.key);

	set hive.optimize.correlation=false;
	-- Cluster by will be optimized
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
	ON (xx.key=yy.key);

	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
	ON (xx.key=yy.key);

	set hive.optimize.correlation=true;
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
	ON (xx.key=yy.key);

	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x Cluster BY key) xx
	JOIN
	(SELECT y.key as key, y.value as value FROM src1 y Cluster BY key) yy
	ON (xx.key=yy.key);

	set hive.optimize.correlation=false;
	-- If hive.optimize.reducededuplication.min.reducer=1,
	-- group by and then order by should be optimized
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
	JOIN
	(SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
	ON (xx.key=yy.key);

	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
	JOIN
	(SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
	ON (xx.key=yy.key);

	set hive.optimize.correlation=true;
	EXPLAIN
	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
	JOIN
	(SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
	ON (xx.key=yy.key);

	SELECT xx.key, xx.value, yy.key, yy.value
	FROM
	(SELECT x.key as key, x.value as value FROM src x CLUSTER BY key) xx
	JOIN
	(SELECT y.key as key, count(*) as value FROM src1 y GROUP BY y.key ORDER BY key) yy
	ON (xx.key=yy.key);