| --! qt:dataset:src1 |
| --! qt:dataset:src |
| set hive.mapred.mode=nonstrict; |
| set hive.auto.convert.join=false; |
| set hive.optimize.correlation=false; |
| -- When Correlation Optimizer is turned off, 5 MR jobs will be generated. |
| -- When Correlation Optimizer is turned on, the subquery tmp will be evalauted |
| -- in a single MR job (including the subquery b, the subquery d, and b join d). |
| -- At the reduce side of the MR job evaluating tmp, two operation paths |
| -- (for subquery b and d) have different depths. The path starting from subquery b |
| -- is JOIN->GBY->JOIN, which has a depth of 3. While, the path starting from subquery d |
| -- is JOIN->JOIN. We should be able to handle this case. |
| EXPLAIN |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT b.key AS key, b.cnt AS cnt, d.value AS value |
| FROM (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b |
| JOIN (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d |
| ON b.key = d.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT b.key AS key, b.cnt AS cnt, d.value AS value |
| FROM (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b |
| JOIN (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d |
| ON b.key = d.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| EXPLAIN |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT b.key AS key, b.cnt AS cnt, d.value AS value |
| FROM (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b |
| JOIN (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d |
| ON b.key = d.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT b.key AS key, b.cnt AS cnt, d.value AS value |
| FROM (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b |
| JOIN (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d |
| ON b.key = d.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| set hive.auto.convert.join=true; |
| -- Enable hive.auto.convert.join. |
| EXPLAIN |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT b.key AS key, b.cnt AS cnt, d.value AS value |
| FROM (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b |
| JOIN (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d |
| ON b.key = d.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT b.key AS key, b.cnt AS cnt, d.value AS value |
| FROM (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) b |
| JOIN (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) d |
| ON b.key = d.key) tmp; |
| |
| set hive.auto.convert.join=false; |
| set hive.optimize.correlation=false; |
| EXPLAIN |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT d.key AS key, d.cnt AS cnt, b.value as value |
| FROM (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) b |
| JOIN (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) d |
| ON b.key = d.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT d.key AS key, d.cnt AS cnt, b.value as value |
| FROM (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) b |
| JOIN (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) d |
| ON b.key = d.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| EXPLAIN |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT d.key AS key, d.cnt AS cnt, b.value as value |
| FROM (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) b |
| JOIN (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) d |
| ON b.key = d.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT d.key AS key, d.cnt AS cnt, b.value as value |
| FROM (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) b |
| JOIN (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) d |
| ON b.key = d.key) tmp; |
| |
| set hive.optimize.correlation=true; |
| set hive.auto.convert.join=true; |
| -- Enable hive.auto.convert.join. |
| EXPLAIN |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT d.key AS key, d.cnt AS cnt, b.value as value |
| FROM (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) b |
| JOIN (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) d |
| ON b.key = d.key) tmp; |
| |
| SELECT SUM(HASH(tmp.key)), SUM(HASH(tmp.cnt)), SUM(HASH(tmp.value)) |
| FROM (SELECT d.key AS key, d.cnt AS cnt, b.value as value |
| FROM (SELECT x.key, x.value FROM src1 x JOIN src y ON (x.key = y.key)) b |
| JOIN (SELECT x.key, count(1) AS cnt FROM src1 x JOIN src y ON (x.key = y.key) group by x.key) d |
| ON b.key = d.key) tmp; |