[SPARK-48180][SQL] Improve error when UDTF call with TABLE arg forgets parentheses around multiple PARTITION/ORDER BY exprs
### What changes were proposed in this pull request?
This PR improves the error message when a table-valued function call has a TABLE argument with a PARTITION BY or ORDER BY clause with more than one associated expression, but forgets parentheses around them.
For example:
```
SELECT * FROM testUDTF(
TABLE(SELECT 1 AS device_id, 2 AS data_ds)
WITH SINGLE PARTITION
ORDER BY device_id, data_ds)
```
This query previously returned an obscure, unrelated error:
```
[UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT] Unsupported subquery expression: Table arguments are used in a function where they are not supported:
'UnresolvedTableValuedFunction [tvf], [table-argument#338 [], 'data_ds](https://issues.apache.org/jira/browse/SPARK-48180#338%20[],%20'data_ds), false
+- Project [1 AS device_id#336, 2 AS data_ds#337](https://issues.apache.org/jira/browse/SPARK-48180#336,%202%20AS%20data_ds#337)
+- OneRowRelation
```
Now it returns a reasonable error:
```
The table function call includes a table argument with an invalid partitioning/ordering specification: the ORDER BY clause included multiple expressions without parentheses surrounding them; please add parentheses around these expressions and then retry the query again. (line 4, pos 2)
== SQL ==
SELECT * FROM testUDTF(
TABLE(SELECT 1 AS device_id, 2 AS data_ds)
WITH SINGLE PARTITION
--^^^
ORDER BY device_id, data_ds)
```
### Why are the changes needed?
Here we improve error messages for common SQL syntax mistakes.
### Does this PR introduce _any_ user-facing change?
Yes, see above.
### How was this patch tested?
This PR adds test coverage.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #46451 from dtenedor/udtf-analyzer-bug.
Authored-by: Daniel Tenedorio <daniel.tenedorio@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index 653224c..249f55f 100644
--- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -838,9 +838,11 @@
: ((WITH SINGLE PARTITION)
| ((PARTITION | DISTRIBUTE) BY
(((LEFT_PAREN partition+=expression (COMMA partition+=expression)* RIGHT_PAREN))
+ | (expression (COMMA invalidMultiPartitionExpression=expression)+)
| partition+=expression)))
((ORDER | SORT) BY
(((LEFT_PAREN sortItem (COMMA sortItem)* RIGHT_PAREN)
+ | (sortItem (COMMA invalidMultiSortItem=sortItem)+)
| sortItem)))?
;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 7d2355b..326f1e7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -1638,6 +1638,20 @@
}
partitionByExpressions = p.partition.asScala.map(expression).toSeq
orderByExpressions = p.sortItem.asScala.map(visitSortItem).toSeq
+ def invalidPartitionOrOrderingExpression(clause: String): String = {
+ "The table function call includes a table argument with an invalid " +
+ s"partitioning/ordering specification: the $clause clause included multiple " +
+ "expressions without parentheses surrounding them; please add parentheses around " +
+ "these expressions and then retry the query again"
+ }
+ validate(
+ Option(p.invalidMultiPartitionExpression).isEmpty,
+ message = invalidPartitionOrOrderingExpression("PARTITION BY"),
+ ctx = p.invalidMultiPartitionExpression)
+ validate(
+ Option(p.invalidMultiSortItem).isEmpty,
+ message = invalidPartitionOrOrderingExpression("ORDER BY"),
+ ctx = p.invalidMultiSortItem)
}
validate(
!(withSinglePartition && partitionByExpressions.nonEmpty),
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index 17dd734..8d01040 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
@@ -1617,14 +1617,23 @@
parameters = Map(
"error" -> "'order'",
"hint" -> ""))
- val sql8 = s"select * from my_tvf(arg1 => table(select col1, col2, col3 from v2) " +
- s"$partition by col1, col2 order by col2 asc, col3 desc)"
+ val sql8tableArg = "table(select col1, col2, col3 from v2)"
+ val sql8partition = s"$partition by col1, col2 order by col2 asc, col3 desc"
+ val sql8 = s"select * from my_tvf(arg1 => $sql8tableArg $sql8partition)"
checkError(
exception = parseException(sql8),
- errorClass = "PARSE_SYNTAX_ERROR",
+ errorClass = "_LEGACY_ERROR_TEMP_0064",
parameters = Map(
- "error" -> "'order'",
- "hint" -> ": extra input 'order'"))
+ "msg" ->
+ ("The table function call includes a table argument with an invalid " +
+ "partitioning/ordering specification: the PARTITION BY clause included multiple " +
+ "expressions without parentheses surrounding them; please add parentheses around " +
+ "these expressions and then retry the query again")),
+ context = ExpectedContext(
+ fragment = s"$sql8tableArg $sql8partition",
+ start = 29,
+ stop = 110 + partition.length)
+ )
}
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala
index 989597a..1eaf1d2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala
@@ -20,6 +20,7 @@
import org.apache.spark.api.python.PythonEvalType
import org.apache.spark.sql.{AnalysisException, IntegratedUDFTestUtils, QueryTest, Row}
import org.apache.spark.sql.catalyst.expressions.{Add, Alias, Expression, FunctionTableSubqueryArgumentExpression, Literal}
+import org.apache.spark.sql.catalyst.parser.ParseException
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, OneRowRelation, Project, Repartition, RepartitionByExpression, Sort, SubqueryAlias}
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.internal.SQLConf
@@ -363,4 +364,29 @@
Row("abc"))
}
}
+
+ test("SPARK-48180: Analyzer bug with multiple ORDER BY items for input table argument") {
+ assume(shouldTestPythonUDFs)
+ spark.udtf.registerPython("testUDTF", pythonUDTF)
+ checkError(
+ exception = intercept[ParseException](sql(
+ """
+ |SELECT * FROM testUDTF(
+ | TABLE(SELECT 1 AS device_id, 2 AS data_ds)
+ | WITH SINGLE PARTITION
+ | ORDER BY device_id, data_ds)
+ |""".stripMargin)),
+ errorClass = "_LEGACY_ERROR_TEMP_0064",
+ parameters = Map("msg" ->
+ ("The table function call includes a table argument with an invalid " +
+ "partitioning/ordering specification: the ORDER BY clause included multiple " +
+ "expressions without parentheses surrounding them; please add parentheses around these " +
+ "expressions and then retry the query again")),
+ context = ExpectedContext(
+ fragment = "TABLE(SELECT 1 AS device_id, 2 AS data_ds)\n " +
+ "WITH SINGLE PARTITION\n " +
+ "ORDER BY device_id, data_ds",
+ start = 27,
+ stop = 122))
+ }
}