HIVE-27658: Error resolving join keys during conversion to dynamic partition hashjoin (Stamatis Zampetakis reviewed by Denys Kuzmenko)
Sometimes, when the compiler attempts to convert a Join to a
Dynamic Partition HashJoin (DPHJ) and certain assumptions about the
shape of the plan do not hold, a SemanticException is thrown.
The DPHJ is a performance optimization, so there is no reason to raise
a fatal error when the conversion cannot be performed. It is preferable
to simply skip the conversion and use a regular join rather than block
the query entirely. The `MapJoinProcessor.getMapJoinDesc` method
already returns null in certain cases, so it is safe to add another
exit condition.
Overview of changes:
1. Return null when join key resolution fails and simply skip conversion
to DPHJ.
2. Log a warning instead of throwing a fatal SemanticException.
3. Enrich the error message with more information to improve diagnosability.
Bringing the plan into a shape that allows the DPHJ conversion is
still meaningful, but that can be tracked independently in separate tickets.
Close apache/hive#4930
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
index e922ce4..adf4fbe 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
@@ -1306,7 +1306,8 @@ public static MapJoinDesc getMapJoinDesc(HiveConf hconf,
List<ExprNodeDesc> keyExprList =
ExprNodeDescUtils.resolveJoinKeysAsRSColumns(mapEntry.getValue(), rsParent);
if (keyExprList == null) {
- throw new SemanticException("Error resolving join keys");
+ LOG.warn("Error resolving join keys {} in {} {}", mapEntry.getValue(), rsParent, rsParent.getColumnExprMap());
+ return null;
}
newKeyExprMap.put(pos, keyExprList);
}
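For illustration only (not part of the patch): a minimal, hypothetical sketch of the
pattern the change above relies on. The helper signals an unsupported plan shape by
logging a warning and returning null, and the caller keeps the regular join instead of
aborting compilation. All class and method names here are invented for the example and
do not correspond to actual Hive APIs; only the warn-and-return-null behaviour mirrors
the hunk above. It assumes the SLF4J API is on the classpath, as it is in Hive.

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    public class BestEffortConversionSketch {
      private static final Logger LOG = LoggerFactory.getLogger(BestEffortConversionSketch.class);

      /** Stand-in for the descriptor produced by a successful DPHJ conversion. */
      static final class ConvertedJoin { }

      /** Returns the converted join, or null when the plan shape is unsupported. */
      static ConvertedJoin tryConvert(boolean joinKeysResolvable) {
        if (!joinKeysResolvable) {
          // Mirrors the patch: warn with context instead of throwing a fatal exception.
          LOG.warn("Error resolving join keys; skipping dynamic partition hash join conversion");
          return null;
        }
        return new ConvertedJoin();
      }

      public static void main(String[] args) {
        ConvertedJoin converted = tryConvert(false);
        if (converted == null) {
          // The caller keeps the regular join; the query still compiles and runs.
          System.out.println("Falling back to a regular join");
        }
      }
    }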
diff --git a/ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_4.q b/ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_4.q
new file mode 100644
index 0000000..d15307a
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_4.q
@@ -0,0 +1,24 @@
+CREATE TABLE table_a (start_date date, product_id int);
+
+ALTER TABLE table_a UPDATE STATISTICS SET('numRows'='200000000','rawDataSize'='0' );
+ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' );
+ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='2500' );
+
+CREATE TABLE table_b (start_date date, product_id int, product_sk string);
+
+ALTER TABLE table_b UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='0' );
+ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' );
+ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='500' );
+ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_sk SET ('numDVs'='300000','numNulls'='0','avgColLen'='10','maxColLen'='10');
+
+set hive.optimize.dynamic.partition.hashjoin=true;
+set hive.auto.convert.join=true;
+set hive.auto.convert.join.noconditionaltask.size=180000000;
+
+EXPLAIN
+SELECT TC.CONST_DATE, TB.PRODUCT_SK
+FROM TABLE_A TA
+INNER JOIN (SELECT TO_DATE(FROM_UNIXTIME(1701088643)) AS CONST_DATE) TC
+ ON TA.START_DATE = TC.CONST_DATE
+INNER JOIN TABLE_B TB
+ ON TB.START_DATE = TC.CONST_DATE AND TA.PRODUCT_ID = TB.PRODUCT_ID;
diff --git a/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out b/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out
new file mode 100644
index 0000000..5ca79e2
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out
@@ -0,0 +1,210 @@
+PREHOOK: query: CREATE TABLE table_a (start_date date, product_id int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@table_a
+POSTHOOK: query: CREATE TABLE table_a (start_date date, product_id int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@table_a
+PREHOOK: query: ALTER TABLE table_a UPDATE STATISTICS SET('numRows'='200000000','rawDataSize'='0' )
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@table_a
+PREHOOK: Output: default@table_a
+POSTHOOK: query: ALTER TABLE table_a UPDATE STATISTICS SET('numRows'='200000000','rawDataSize'='0' )
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@table_a
+POSTHOOK: Output: default@table_a
+PREHOOK: query: ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' )
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@table_a
+PREHOOK: Output: default@table_a
+POSTHOOK: query: ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' )
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@table_a
+POSTHOOK: Output: default@table_a
+PREHOOK: query: ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='2500' )
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@table_a
+PREHOOK: Output: default@table_a
+POSTHOOK: query: ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='2500' )
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@table_a
+POSTHOOK: Output: default@table_a
+PREHOOK: query: CREATE TABLE table_b (start_date date, product_id int, product_sk string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@table_b
+POSTHOOK: query: CREATE TABLE table_b (start_date date, product_id int, product_sk string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@table_b
+PREHOOK: query: ALTER TABLE table_b UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='0' )
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@table_b
+PREHOOK: Output: default@table_b
+POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='0' )
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@table_b
+POSTHOOK: Output: default@table_b
+PREHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' )
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@table_b
+PREHOOK: Output: default@table_b
+POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' )
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@table_b
+POSTHOOK: Output: default@table_b
+PREHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='500' )
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@table_b
+PREHOOK: Output: default@table_b
+POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='500' )
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@table_b
+POSTHOOK: Output: default@table_b
+PREHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_sk SET ('numDVs'='300000','numNulls'='0','avgColLen'='10','maxColLen'='10')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@table_b
+PREHOOK: Output: default@table_b
+POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_sk SET ('numDVs'='300000','numNulls'='0','avgColLen'='10','maxColLen'='10')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@table_b
+POSTHOOK: Output: default@table_b
+PREHOOK: query: EXPLAIN
+SELECT TC.CONST_DATE, TB.PRODUCT_SK
+FROM TABLE_A TA
+INNER JOIN (SELECT TO_DATE(FROM_UNIXTIME(1701088643)) AS CONST_DATE) TC
+ ON TA.START_DATE = TC.CONST_DATE
+INNER JOIN TABLE_B TB
+ ON TB.START_DATE = TC.CONST_DATE AND TA.PRODUCT_ID = TB.PRODUCT_ID
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Input: default@table_a
+PREHOOK: Input: default@table_b
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT TC.CONST_DATE, TB.PRODUCT_SK
+FROM TABLE_A TA
+INNER JOIN (SELECT TO_DATE(FROM_UNIXTIME(1701088643)) AS CONST_DATE) TC
+ ON TA.START_DATE = TC.CONST_DATE
+INNER JOIN TABLE_B TB
+ ON TB.START_DATE = TC.CONST_DATE AND TA.PRODUCT_ID = TB.PRODUCT_ID
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Input: default@table_a
+POSTHOOK: Input: default@table_b
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Map 3 (BROADCAST_EDGE)
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: ta
+ filterExpr: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean)
+ Statistics: Num rows: 200000000 Data size: 12000000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean)
+ Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: product_id (type: int)
+ outputColumnNames: _col1
+ Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 DATE'2023-11-27' (type: date)
+ 1 DATE'2023-11-27' (type: date)
+ outputColumnNames: _col1, _col2
+ input vertices:
+ 1 Map 3
+ Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: int), _col2 (type: date)
+ null sort order: zz
+ sort order: ++
+ Map-reduce partition columns: _col1 (type: int), _col2 (type: date)
+ Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: _dummy_table
+ Row Limit Per Split: 1
+ Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: DATE'2023-11-27' (type: date)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: DATE'2023-11-27' (type: date)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: DATE'2023-11-27' (type: date)
+ Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: date)
+ Execution mode: llap
+ LLAP IO: no inputs
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: tb
+ filterExpr: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean)
+ Statistics: Num rows: 100000000 Data size: 15400000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean)
+ Statistics: Num rows: 50000000 Data size: 7700000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: product_id (type: int), product_sk (type: string)
+ outputColumnNames: _col1, _col2
+ Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col1 (type: int), DATE'2023-11-27' (type: date)
+ null sort order: zz
+ sort order: ++
+ Map-reduce partition columns: _col1 (type: int), DATE'2023-11-27' (type: date)
+ Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col2 (type: string)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int), _col2 (type: date)
+ 1 _col1 (type: int), DATE'2023-11-27' (type: date)
+ outputColumnNames: _col2, _col5
+ Statistics: Num rows: 16666666666 Data size: 2499999999900 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col2 (type: date), _col5 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 16666666666 Data size: 1566666666604 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 16666666666 Data size: 1566666666604 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+