blob: a65ed72c27ea1815916eed916116b35c38f0c5d5 [file] [log] [blame]
--! qt:dataset:src
set hive.stats.column.autogather=false;
set hive.mapred.mode=nonstrict;
set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecutePrinter,org.apache.hadoop.hive.ql.hooks.PrintCompletedTasksHook;
create table smallTbl1(key string, value string);
insert overwrite table smallTbl1 select * from src where key < 10;
create table smallTbl2(key string, value string);
insert overwrite table smallTbl2 select * from src where key < 10;
create table smallTbl3(key string, value string);
insert overwrite table smallTbl3 select * from src where key < 10;
create table smallTbl4(key string, value string);
insert overwrite table smallTbl4 select * from src where key < 10;
create table bigTbl(key string, value string);
insert overwrite table bigTbl
select * from
(
select * from src
union all
select * from src
union all
select * from src
union all
select * from src
union all
select * from src
union all
select * from src
union all
select * from src
union all
select * from src
union all
select * from src
union all
select * from src
) subq;
set hive.auto.convert.join=true;
explain
select count(*) FROM
(select bigTbl.key as key, bigTbl.value as value1,
bigTbl.value as value2 FROM bigTbl JOIN smallTbl1
on (bigTbl.key = smallTbl1.key)
) firstjoin
JOIN
smallTbl2 on (firstjoin.value1 = smallTbl2.value);
select count(*) FROM
(select bigTbl.key as key, bigTbl.value as value1,
bigTbl.value as value2 FROM bigTbl JOIN smallTbl1
on (bigTbl.key = smallTbl1.key)
) firstjoin
JOIN
smallTbl2 on (firstjoin.value1 = smallTbl2.value);
set hive.auto.convert.join.noconditionaltask=true;
set hive.auto.convert.join.noconditionaltask.size=10000;
-- Now run a query with two-way join, which should be converted into a
-- map-join followed by groupby - two MR jobs overall
explain
select count(*) FROM
(select bigTbl.key as key, bigTbl.value as value1,
bigTbl.value as value2 FROM bigTbl JOIN smallTbl1
on (bigTbl.key = smallTbl1.key)
) firstjoin
JOIN
smallTbl2 on (firstjoin.value1 = smallTbl2.value);
select count(*) FROM
(select bigTbl.key as key, bigTbl.value as value1,
bigTbl.value as value2 FROM bigTbl JOIN smallTbl1
on (bigTbl.key = smallTbl1.key)
) firstjoin
JOIN
smallTbl2 on (firstjoin.value1 = smallTbl2.value);
-- Now run a query with two-way join, which should first be converted into a
-- map-join followed by groupby and then finally into a single MR job.
explain
select count(*) FROM
(select bigTbl.key as key, bigTbl.value as value1,
bigTbl.value as value2 FROM bigTbl JOIN smallTbl1
on (bigTbl.key = smallTbl1.key)
) firstjoin
JOIN
smallTbl2 on (firstjoin.value1 = smallTbl2.value)
group by smallTbl2.key;
select count(*) FROM
(select bigTbl.key as key, bigTbl.value as value1,
bigTbl.value as value2 FROM bigTbl JOIN smallTbl1
on (bigTbl.key = smallTbl1.key)
) firstjoin
JOIN
smallTbl2 on (firstjoin.value1 = smallTbl2.value)
group by smallTbl2.key;
drop table bigTbl;
create table bigTbl(key1 string, key2 string, value string);
insert overwrite table bigTbl
select * from
(
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
union all
select key as key1, key as key2, value from src
) subq;
set hive.auto.convert.join.noconditionaltask=false;
-- First disable noconditionaltask
EXPLAIN
SELECT SUM(HASH(join3.key1)),
SUM(HASH(join3.key2)),
SUM(HASH(join3.key3)),
SUM(HASH(join3.key4)),
SUM(HASH(join3.key5)),
SUM(HASH(smallTbl4.key)),
SUM(HASH(join3.value1)),
SUM(HASH(join3.value2))
FROM (SELECT join2.key1 as key1,
join2.key2 as key2,
join2.key3 as key3,
join2.key4 as key4,
smallTbl3.key as key5,
join2.value1 as value1,
join2.value2 as value2
FROM (SELECT join1.key1 as key1,
join1.key2 as key2,
join1.key3 as key3,
smallTbl2.key as key4,
join1.value1 as value1,
join1.value2 as value2
FROM (SELECT bigTbl.key1 as key1,
bigTbl.key2 as key2,
smallTbl1.key as key3,
bigTbl.value as value1,
bigTbl.value as value2
FROM bigTbl
JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1
JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2
JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3
JOIN smallTbl4 ON (join3.key3 = smallTbl4.key);
SELECT SUM(HASH(join3.key1)),
SUM(HASH(join3.key2)),
SUM(HASH(join3.key3)),
SUM(HASH(join3.key4)),
SUM(HASH(join3.key5)),
SUM(HASH(smallTbl4.key)),
SUM(HASH(join3.value1)),
SUM(HASH(join3.value2))
FROM (SELECT join2.key1 as key1,
join2.key2 as key2,
join2.key3 as key3,
join2.key4 as key4,
smallTbl3.key as key5,
join2.value1 as value1,
join2.value2 as value2
FROM (SELECT join1.key1 as key1,
join1.key2 as key2,
join1.key3 as key3,
smallTbl2.key as key4,
join1.value1 as value1,
join1.value2 as value2
FROM (SELECT bigTbl.key1 as key1,
bigTbl.key2 as key2,
smallTbl1.key as key3,
bigTbl.value as value1,
bigTbl.value as value2
FROM bigTbl
JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1
JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2
JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3
JOIN smallTbl4 ON (join3.key3 = smallTbl4.key);
set hive.auto.convert.join.noconditionaltask=true;
set hive.auto.convert.join.noconditionaltask.size=10000;
-- Enable noconditionaltask and set the size of hive.auto.convert.join.noconditionaltask.size
-- to 10000, which is large enough to fit all four small tables (smallTbl1 to smallTbl4).
-- We will use a single MR job to evaluate this query.
EXPLAIN
SELECT SUM(HASH(join3.key1)),
SUM(HASH(join3.key2)),
SUM(HASH(join3.key3)),
SUM(HASH(join3.key4)),
SUM(HASH(join3.key5)),
SUM(HASH(smallTbl4.key)),
SUM(HASH(join3.value1)),
SUM(HASH(join3.value2))
FROM (SELECT join2.key1 as key1,
join2.key2 as key2,
join2.key3 as key3,
join2.key4 as key4,
smallTbl3.key as key5,
join2.value1 as value1,
join2.value2 as value2
FROM (SELECT join1.key1 as key1,
join1.key2 as key2,
join1.key3 as key3,
smallTbl2.key as key4,
join1.value1 as value1,
join1.value2 as value2
FROM (SELECT bigTbl.key1 as key1,
bigTbl.key2 as key2,
smallTbl1.key as key3,
bigTbl.value as value1,
bigTbl.value as value2
FROM bigTbl
JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1
JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2
JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3
JOIN smallTbl4 ON (join3.key3 = smallTbl4.key);
SELECT SUM(HASH(join3.key1)),
SUM(HASH(join3.key2)),
SUM(HASH(join3.key3)),
SUM(HASH(join3.key4)),
SUM(HASH(join3.key5)),
SUM(HASH(smallTbl4.key)),
SUM(HASH(join3.value1)),
SUM(HASH(join3.value2))
FROM (SELECT join2.key1 as key1,
join2.key2 as key2,
join2.key3 as key3,
join2.key4 as key4,
smallTbl3.key as key5,
join2.value1 as value1,
join2.value2 as value2
FROM (SELECT join1.key1 as key1,
join1.key2 as key2,
join1.key3 as key3,
smallTbl2.key as key4,
join1.value1 as value1,
join1.value2 as value2
FROM (SELECT bigTbl.key1 as key1,
bigTbl.key2 as key2,
smallTbl1.key as key3,
bigTbl.value as value1,
bigTbl.value as value2
FROM bigTbl
JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1
JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2
JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3
JOIN smallTbl4 ON (join3.key3 = smallTbl4.key);
set hive.auto.convert.join.noconditionaltask.size=200;
-- Enable noconditionaltask and set the size of hive.auto.convert.join.noconditionaltask.size
-- to 200, which is large enough to fit two small tables. We will have two jobs to evaluate this
-- query. The first job is a Map-only job to evaluate join1 and join2.
-- The second job will evaluate the rest of this query.
EXPLAIN
SELECT SUM(HASH(join3.key1)),
SUM(HASH(join3.key2)),
SUM(HASH(join3.key3)),
SUM(HASH(join3.key4)),
SUM(HASH(join3.key5)),
SUM(HASH(smallTbl4.key)),
SUM(HASH(join3.value1)),
SUM(HASH(join3.value2))
FROM (SELECT join2.key1 as key1,
join2.key2 as key2,
join2.key3 as key3,
join2.key4 as key4,
smallTbl3.key as key5,
join2.value1 as value1,
join2.value2 as value2
FROM (SELECT join1.key1 as key1,
join1.key2 as key2,
join1.key3 as key3,
smallTbl2.key as key4,
join1.value1 as value1,
join1.value2 as value2
FROM (SELECT bigTbl.key1 as key1,
bigTbl.key2 as key2,
smallTbl1.key as key3,
bigTbl.value as value1,
bigTbl.value as value2
FROM bigTbl
JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1
JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2
JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3
JOIN smallTbl4 ON (join3.key3 = smallTbl4.key);
SELECT SUM(HASH(join3.key1)),
SUM(HASH(join3.key2)),
SUM(HASH(join3.key3)),
SUM(HASH(join3.key4)),
SUM(HASH(join3.key5)),
SUM(HASH(smallTbl4.key)),
SUM(HASH(join3.value1)),
SUM(HASH(join3.value2))
FROM (SELECT join2.key1 as key1,
join2.key2 as key2,
join2.key3 as key3,
join2.key4 as key4,
smallTbl3.key as key5,
join2.value1 as value1,
join2.value2 as value2
FROM (SELECT join1.key1 as key1,
join1.key2 as key2,
join1.key3 as key3,
smallTbl2.key as key4,
join1.value1 as value1,
join1.value2 as value2
FROM (SELECT bigTbl.key1 as key1,
bigTbl.key2 as key2,
smallTbl1.key as key3,
bigTbl.value as value1,
bigTbl.value as value2
FROM bigTbl
JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1
JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2
JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3
JOIN smallTbl4 ON (join3.key3 = smallTbl4.key);
set hive.auto.convert.join.noconditionaltask.size=0;
-- Enable noconditionaltask and but set the size of hive.auto.convert.join.noconditionaltask.size
-- to 0. The plan will be the same as the one with a disabled nonconditionaltask.
EXPLAIN
SELECT SUM(HASH(join3.key1)),
SUM(HASH(join3.key2)),
SUM(HASH(join3.key3)),
SUM(HASH(join3.key4)),
SUM(HASH(join3.key5)),
SUM(HASH(smallTbl4.key)),
SUM(HASH(join3.value1)),
SUM(HASH(join3.value2))
FROM (SELECT join2.key1 as key1,
join2.key2 as key2,
join2.key3 as key3,
join2.key4 as key4,
smallTbl3.key as key5,
join2.value1 as value1,
join2.value2 as value2
FROM (SELECT join1.key1 as key1,
join1.key2 as key2,
join1.key3 as key3,
smallTbl2.key as key4,
join1.value1 as value1,
join1.value2 as value2
FROM (SELECT bigTbl.key1 as key1,
bigTbl.key2 as key2,
smallTbl1.key as key3,
bigTbl.value as value1,
bigTbl.value as value2
FROM bigTbl
JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1
JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2
JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3
JOIN smallTbl4 ON (join3.key3 = smallTbl4.key);
SELECT SUM(HASH(join3.key1)),
SUM(HASH(join3.key2)),
SUM(HASH(join3.key3)),
SUM(HASH(join3.key4)),
SUM(HASH(join3.key5)),
SUM(HASH(smallTbl4.key)),
SUM(HASH(join3.value1)),
SUM(HASH(join3.value2))
FROM (SELECT join2.key1 as key1,
join2.key2 as key2,
join2.key3 as key3,
join2.key4 as key4,
smallTbl3.key as key5,
join2.value1 as value1,
join2.value2 as value2
FROM (SELECT join1.key1 as key1,
join1.key2 as key2,
join1.key3 as key3,
smallTbl2.key as key4,
join1.value1 as value1,
join1.value2 as value2
FROM (SELECT bigTbl.key1 as key1,
bigTbl.key2 as key2,
smallTbl1.key as key3,
bigTbl.value as value1,
bigTbl.value as value2
FROM bigTbl
JOIN smallTbl1 ON (bigTbl.key1 = smallTbl1.key)) join1
JOIN smallTbl2 ON (join1.value1 = smallTbl2.value)) join2
JOIN smallTbl3 ON (join2.key2 = smallTbl3.key)) join3
JOIN smallTbl4 ON (join3.key3 = smallTbl4.key);