| --! qt:dataset:src |
| set hive.stats.column.autogather=false; |
| set hive.mapred.mode=nonstrict; |
| set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecutePrinter,org.apache.hadoop.hive.ql.hooks.PrintCompletedTasksHook; |
| |
| create table smallTbl1(key string, value string); |
| insert overwrite table smallTbl1 select * from src where key < 10; |
| |
| create table smallTbl2(key string, value string); |
| insert overwrite table smallTbl2 select * from src where key < 10; |
| |
| create table smallTbl3(key string, value string); |
| insert overwrite table smallTbl3 select * from src where key < 10; |
| |
| create table smallTbl4(key string, value string); |
| insert overwrite table smallTbl4 select * from src where key < 10; |
| |
| create table bigTbl(key string, value string); |
| insert overwrite table bigTbl |
| select * from |
| ( |
| select * from src |
| union all |
| select * from src |
| union all |
| select * from src |
| union all |
| select * from src |
| union all |
| select * from src |
| union all |
| select * from src |
| union all |
| select * from src |
| union all |
| select * from src |
| union all |
| select * from src |
| union all |
| select * from src |
| ) subq; |
| |
| set hive.auto.convert.join=true; |
| |
| explain |
| select count(*) FROM |
| (select bigTbl.key as key, bigTbl.value as value1, |
| bigTbl.value as value2 FROM bigTbl JOIN smallTbl1 |
| on (bigTbl.key = smallTbl1.key) |
| ) firstjoin |
| JOIN |
| smallTbl2 on (firstjoin.value1 = smallTbl2.value); |
| |
| select count(*) FROM |
| (select bigTbl.key as key, bigTbl.value as value1, |
| bigTbl.value as value2 FROM bigTbl JOIN smallTbl1 |
| on (bigTbl.key = smallTbl1.key) |
| ) firstjoin |
| JOIN |
| smallTbl2 on (firstjoin.value1 = smallTbl2.value); |
| |
| set hive.auto.convert.join.noconditionaltask=true; |
| set hive.auto.convert.join.noconditionaltask.size=10000; |
| |
| -- Now run a query with two-way join, which should be converted into a |
| -- map-join followed by groupby - two MR jobs overall |
| explain |
| select count(*) FROM |
| (select bigTbl.key as key, bigTbl.value as value1, |
| bigTbl.value as value2 FROM bigTbl JOIN smallTbl1 |
| on (bigTbl.key = smallTbl1.key) |
| ) firstjoin |
| JOIN |
| smallTbl2 on (firstjoin.value1 = smallTbl2.value); |
| |
| select count(*) FROM |
| (select bigTbl.key as key, bigTbl.value as value1, |
| bigTbl.value as value2 FROM bigTbl JOIN smallTbl1 |
| on (bigTbl.key = smallTbl1.key) |
| ) firstjoin |
| JOIN |
| smallTbl2 on (firstjoin.value1 = smallTbl2.value); |
| |
| -- Now run a query with two-way join, which should first be converted into a |
| -- map-join followed by groupby and then finally into a single MR job. |
| |
| explain |
| select count(*) FROM |
| (select bigTbl.key as key, bigTbl.value as value1, |
| bigTbl.value as value2 FROM bigTbl JOIN smallTbl1 |
| on (bigTbl.key = smallTbl1.key) |
| ) firstjoin |
| JOIN |
| smallTbl2 on (firstjoin.value1 = smallTbl2.value) |
| group by smallTbl2.key; |
| |
| select count(*) FROM |
| (select bigTbl.key as key, bigTbl.value as value1, |
| bigTbl.value as value2 FROM bigTbl JOIN smallTbl1 |
| on (bigTbl.key = smallTbl1.key) |
| ) firstjoin |
| JOIN |
| smallTbl2 on (firstjoin.value1 = smallTbl2.value) |
| group by smallTbl2.key; |
| |
| drop table bigTbl; |
| |
| create table bigTbl(key1 string, key2 string, value string); |
| insert overwrite table bigTbl |
| select * from |
| ( |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| union all |
| select key as key1, key as key2, value from src |
| ) subq; |
| |
| set hive.auto.convert.join.noconditionaltask=false; |
| -- First disable noconditionaltask |
| EXPLAIN |
| SELECT SUM(HASH(join3.key1)), |
| SUM(HASH(join3.key2)), |
| SUM(HASH(join3.key3)), |
| SUM(HASH(join3.key4)), |
| SUM(HASH(join3.key5)), |
| SUM(HASH(smallTbl4.key)), |
| SUM(HASH(join3.value1)), |
| SUM(HASH(join3.value2)) |
| FROM (SELECT join2.key1 as key1, |
| join2.key2 as key2, |
| join2.key3 as key3, |
| join2.key4 as key4, |
| smallTbl3.key as key5, |
| join2.value1 as value1, |
| join2.value2 as value2 |
| FROM (SELECT join1.key1 as key1, |
| join1.key2 as key2, |
| join1.key3 as key3, |
| smallTbl2.key as key4, |
| join1.value1 as value1, |
| join1.value2 as value2 |
| FROM (SELECT bigTbl.key1 as key1, |
| bigTbl.key2 as key2, |
| smallTbl1.key as key3, |
| bigTbl.value as value1, |
| bigTbl.value as value2 |
| FROM bigTbl |
| JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1 |
| JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2 |
| JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3 |
| JOIN smallTbl4 ON (join3.key3 = smallTbl4.key); |
| |
| SELECT SUM(HASH(join3.key1)), |
| SUM(HASH(join3.key2)), |
| SUM(HASH(join3.key3)), |
| SUM(HASH(join3.key4)), |
| SUM(HASH(join3.key5)), |
| SUM(HASH(smallTbl4.key)), |
| SUM(HASH(join3.value1)), |
| SUM(HASH(join3.value2)) |
| FROM (SELECT join2.key1 as key1, |
| join2.key2 as key2, |
| join2.key3 as key3, |
| join2.key4 as key4, |
| smallTbl3.key as key5, |
| join2.value1 as value1, |
| join2.value2 as value2 |
| FROM (SELECT join1.key1 as key1, |
| join1.key2 as key2, |
| join1.key3 as key3, |
| smallTbl2.key as key4, |
| join1.value1 as value1, |
| join1.value2 as value2 |
| FROM (SELECT bigTbl.key1 as key1, |
| bigTbl.key2 as key2, |
| smallTbl1.key as key3, |
| bigTbl.value as value1, |
| bigTbl.value as value2 |
| FROM bigTbl |
| JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1 |
| JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2 |
| JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3 |
| JOIN smallTbl4 ON (join3.key3 = smallTbl4.key); |
| |
| set hive.auto.convert.join.noconditionaltask=true; |
| set hive.auto.convert.join.noconditionaltask.size=10000; |
| -- Enable noconditionaltask and set the size of hive.auto.convert.join.noconditionaltask.size |
| -- to 10000, which is large enough to fit all four small tables (smallTbl1 to smallTbl4). |
| -- We will use a single MR job to evaluate this query. |
| EXPLAIN |
| SELECT SUM(HASH(join3.key1)), |
| SUM(HASH(join3.key2)), |
| SUM(HASH(join3.key3)), |
| SUM(HASH(join3.key4)), |
| SUM(HASH(join3.key5)), |
| SUM(HASH(smallTbl4.key)), |
| SUM(HASH(join3.value1)), |
| SUM(HASH(join3.value2)) |
| FROM (SELECT join2.key1 as key1, |
| join2.key2 as key2, |
| join2.key3 as key3, |
| join2.key4 as key4, |
| smallTbl3.key as key5, |
| join2.value1 as value1, |
| join2.value2 as value2 |
| FROM (SELECT join1.key1 as key1, |
| join1.key2 as key2, |
| join1.key3 as key3, |
| smallTbl2.key as key4, |
| join1.value1 as value1, |
| join1.value2 as value2 |
| FROM (SELECT bigTbl.key1 as key1, |
| bigTbl.key2 as key2, |
| smallTbl1.key as key3, |
| bigTbl.value as value1, |
| bigTbl.value as value2 |
| FROM bigTbl |
| JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1 |
| JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2 |
| JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3 |
| JOIN smallTbl4 ON (join3.key3 = smallTbl4.key); |
| |
| SELECT SUM(HASH(join3.key1)), |
| SUM(HASH(join3.key2)), |
| SUM(HASH(join3.key3)), |
| SUM(HASH(join3.key4)), |
| SUM(HASH(join3.key5)), |
| SUM(HASH(smallTbl4.key)), |
| SUM(HASH(join3.value1)), |
| SUM(HASH(join3.value2)) |
| FROM (SELECT join2.key1 as key1, |
| join2.key2 as key2, |
| join2.key3 as key3, |
| join2.key4 as key4, |
| smallTbl3.key as key5, |
| join2.value1 as value1, |
| join2.value2 as value2 |
| FROM (SELECT join1.key1 as key1, |
| join1.key2 as key2, |
| join1.key3 as key3, |
| smallTbl2.key as key4, |
| join1.value1 as value1, |
| join1.value2 as value2 |
| FROM (SELECT bigTbl.key1 as key1, |
| bigTbl.key2 as key2, |
| smallTbl1.key as key3, |
| bigTbl.value as value1, |
| bigTbl.value as value2 |
| FROM bigTbl |
| JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1 |
| JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2 |
| JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3 |
| JOIN smallTbl4 ON (join3.key3 = smallTbl4.key); |
| |
| set hive.auto.convert.join.noconditionaltask.size=200; |
| -- Enable noconditionaltask and set the size of hive.auto.convert.join.noconditionaltask.size |
| -- to 200, which is large enough to fit two small tables. We will have two jobs to evaluate this |
| -- query. The first job is a Map-only job to evaluate join1 and join2. |
| -- The second job will evaluate the rest of this query. |
| EXPLAIN |
| SELECT SUM(HASH(join3.key1)), |
| SUM(HASH(join3.key2)), |
| SUM(HASH(join3.key3)), |
| SUM(HASH(join3.key4)), |
| SUM(HASH(join3.key5)), |
| SUM(HASH(smallTbl4.key)), |
| SUM(HASH(join3.value1)), |
| SUM(HASH(join3.value2)) |
| FROM (SELECT join2.key1 as key1, |
| join2.key2 as key2, |
| join2.key3 as key3, |
| join2.key4 as key4, |
| smallTbl3.key as key5, |
| join2.value1 as value1, |
| join2.value2 as value2 |
| FROM (SELECT join1.key1 as key1, |
| join1.key2 as key2, |
| join1.key3 as key3, |
| smallTbl2.key as key4, |
| join1.value1 as value1, |
| join1.value2 as value2 |
| FROM (SELECT bigTbl.key1 as key1, |
| bigTbl.key2 as key2, |
| smallTbl1.key as key3, |
| bigTbl.value as value1, |
| bigTbl.value as value2 |
| FROM bigTbl |
| JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1 |
| JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2 |
| JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3 |
| JOIN smallTbl4 ON (join3.key3 = smallTbl4.key); |
| |
| SELECT SUM(HASH(join3.key1)), |
| SUM(HASH(join3.key2)), |
| SUM(HASH(join3.key3)), |
| SUM(HASH(join3.key4)), |
| SUM(HASH(join3.key5)), |
| SUM(HASH(smallTbl4.key)), |
| SUM(HASH(join3.value1)), |
| SUM(HASH(join3.value2)) |
| FROM (SELECT join2.key1 as key1, |
| join2.key2 as key2, |
| join2.key3 as key3, |
| join2.key4 as key4, |
| smallTbl3.key as key5, |
| join2.value1 as value1, |
| join2.value2 as value2 |
| FROM (SELECT join1.key1 as key1, |
| join1.key2 as key2, |
| join1.key3 as key3, |
| smallTbl2.key as key4, |
| join1.value1 as value1, |
| join1.value2 as value2 |
| FROM (SELECT bigTbl.key1 as key1, |
| bigTbl.key2 as key2, |
| smallTbl1.key as key3, |
| bigTbl.value as value1, |
| bigTbl.value as value2 |
| FROM bigTbl |
| JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1 |
| JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2 |
| JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3 |
| JOIN smallTbl4 ON (join3.key3 = smallTbl4.key); |
| |
| set hive.auto.convert.join.noconditionaltask.size=0; |
| -- Enable noconditionaltask and but set the size of hive.auto.convert.join.noconditionaltask.size |
| -- to 0. The plan will be the same as the one with a disabled nonconditionaltask. |
| EXPLAIN |
| SELECT SUM(HASH(join3.key1)), |
| SUM(HASH(join3.key2)), |
| SUM(HASH(join3.key3)), |
| SUM(HASH(join3.key4)), |
| SUM(HASH(join3.key5)), |
| SUM(HASH(smallTbl4.key)), |
| SUM(HASH(join3.value1)), |
| SUM(HASH(join3.value2)) |
| FROM (SELECT join2.key1 as key1, |
| join2.key2 as key2, |
| join2.key3 as key3, |
| join2.key4 as key4, |
| smallTbl3.key as key5, |
| join2.value1 as value1, |
| join2.value2 as value2 |
| FROM (SELECT join1.key1 as key1, |
| join1.key2 as key2, |
| join1.key3 as key3, |
| smallTbl2.key as key4, |
| join1.value1 as value1, |
| join1.value2 as value2 |
| FROM (SELECT bigTbl.key1 as key1, |
| bigTbl.key2 as key2, |
| smallTbl1.key as key3, |
| bigTbl.value as value1, |
| bigTbl.value as value2 |
| FROM bigTbl |
| JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1 |
| JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2 |
| JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3 |
| JOIN smallTbl4 ON (join3.key3 = smallTbl4.key); |
| |
| SELECT SUM(HASH(join3.key1)), |
| SUM(HASH(join3.key2)), |
| SUM(HASH(join3.key3)), |
| SUM(HASH(join3.key4)), |
| SUM(HASH(join3.key5)), |
| SUM(HASH(smallTbl4.key)), |
| SUM(HASH(join3.value1)), |
| SUM(HASH(join3.value2)) |
| FROM (SELECT join2.key1 as key1, |
| join2.key2 as key2, |
| join2.key3 as key3, |
| join2.key4 as key4, |
| smallTbl3.key as key5, |
| join2.value1 as value1, |
| join2.value2 as value2 |
| FROM (SELECT join1.key1 as key1, |
| join1.key2 as key2, |
| join1.key3 as key3, |
| smallTbl2.key as key4, |
| join1.value1 as value1, |
| join1.value2 as value2 |
| FROM (SELECT bigTbl.key1 as key1, |
| bigTbl.key2 as key2, |
| smallTbl1.key as key3, |
| bigTbl.value as value1, |
| bigTbl.value as value2 |
| FROM bigTbl |
| JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1 |
| JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2 |
| JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3 |
| JOIN smallTbl4 ON (join3.key3 = smallTbl4.key); |