| set hive.mapred.mode=nonstrict; |
| |
| CREATE TABLE T1_n146(key INT, val STRING); |
| LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T1_n146; |
| CREATE TABLE T2_n86(key INT, val STRING); |
| LOAD DATA LOCAL INPATH '../../data/files/T2.txt' INTO TABLE T2_n86; |
| CREATE TABLE T3_n34(key INT, val STRING); |
| LOAD DATA LOCAL INPATH '../../data/files/T1.txt' INTO TABLE T3_n34; |
| |
| set hive.auto.convert.join=false; |
| set hive.optimize.correlation=false; |
| -- When Correlation Optimizer is turned off, this query will be evaluated |
| -- by 3 MR jobs. |
| -- When Correlation Optimizer is turned on, this query will be evaluated by |
| -- 2 MR jobs. The subquery tmp will be evaluated in a single MR job. |
| EXPLAIN |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x JOIN T1_n146 y ON (x.key = y.key) JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x JOIN T1_n146 y ON (x.key = y.key) JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x JOIN T1_n146 y ON (x.key = y.key) JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x JOIN T1_n146 y ON (x.key = y.key) JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| set hive.auto.convert.join=true; |
| -- Enable hive.auto.convert.join. |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x JOIN T1_n146 y ON (x.key = y.key) JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x JOIN T1_n146 y ON (x.key = y.key) JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| set hive.auto.convert.join=false; |
| set hive.optimize.correlation=false; |
| -- This case should be optimized, since the key of GroupByOperator is from the leftmost table |
| -- of a chain of LEFT OUTER JOINs. |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT x.key AS key, count(1) AS cnt |
| FROM T2_n86 x LEFT OUTER JOIN T1_n146 y ON (x.key = y.key) LEFT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY x.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT x.key AS key, count(1) AS cnt |
| FROM T2_n86 x LEFT OUTER JOIN T1_n146 y ON (x.key = y.key) LEFT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY x.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT x.key AS key, count(1) AS cnt |
| FROM T2_n86 x LEFT OUTER JOIN T1_n146 y ON (x.key = y.key) LEFT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY x.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT x.key AS key, count(1) AS cnt |
| FROM T2_n86 x LEFT OUTER JOIN T1_n146 y ON (x.key = y.key) LEFT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY x.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| -- This query will not be optimized by correlation optimizer because |
| -- GroupByOperator uses y.key (a right table of a left outer join) |
| -- as the key. |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x LEFT OUTER JOIN T1_n146 y ON (x.key = y.key) LEFT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x LEFT OUTER JOIN T1_n146 y ON (x.key = y.key) LEFT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| set hive.optimize.correlation=false; |
| -- This case should be optimized, since the key of GroupByOperator is from the rightmost table |
| -- of a chain of RIGHT OUTER JOINs. |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT z.key AS key, count(1) AS cnt |
| FROM T2_n86 x RIGHT OUTER JOIN T1_n146 y ON (x.key = y.key) RIGHT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY z.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT z.key AS key, count(1) AS cnt |
| FROM T2_n86 x RIGHT OUTER JOIN T1_n146 y ON (x.key = y.key) RIGHT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY z.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT z.key AS key, count(1) AS cnt |
| FROM T2_n86 x RIGHT OUTER JOIN T1_n146 y ON (x.key = y.key) RIGHT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY z.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT z.key AS key, count(1) AS cnt |
| FROM T2_n86 x RIGHT OUTER JOIN T1_n146 y ON (x.key = y.key) RIGHT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY z.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| -- This query will not be optimized by correlation optimizer because |
| -- GroupByOperator uses y.key (a left table of a right outer join) |
| -- as the key. |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x RIGHT OUTER JOIN T1_n146 y ON (x.key = y.key) RIGHT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x RIGHT OUTER JOIN T1_n146 y ON (x.key = y.key) RIGHT OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| set hive.optimize.correlation=false; |
| -- This case should not be optimized because afer the FULL OUTER JOIN, rows with null keys |
| -- are not grouped. |
| set hive.auto.convert.join=false; |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x FULL OUTER JOIN T1_n146 y ON (x.key = y.key) FULL OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x FULL OUTER JOIN T1_n146 y ON (x.key = y.key) FULL OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| set hive.auto.convert.join=true; |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x FULL OUTER JOIN T1_n146 y ON (x.key = y.key) FULL OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x FULL OUTER JOIN T1_n146 y ON (x.key = y.key) FULL OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| set hive.auto.convert.join=false; |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x FULL OUTER JOIN T1_n146 y ON (x.key = y.key) FULL OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x FULL OUTER JOIN T1_n146 y ON (x.key = y.key) FULL OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| set hive.auto.convert.join=true; |
| EXPLAIN VECTORIZATION |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x FULL OUTER JOIN T1_n146 y ON (x.key = y.key) FULL OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)) |
| FROM (SELECT y.key AS key, count(1) AS cnt |
| FROM T2_n86 x FULL OUTER JOIN T1_n146 y ON (x.key = y.key) FULL OUTER JOIN T3_n34 z ON (y.key = z.key) |
| GROUP BY y.key) tmp; |