[SPARK-48180][SQL] Improve error when UDTF call with TABLE arg forgets parentheses around multiple PARTITION/ORDER BY exprs ### What changes were proposed in this pull request? This PR improves the error message when a table-valued function call has a TABLE argument with a PARTITION BY or ORDER BY clause with more than one associated expression, but forgets parentheses around them. For example: ``` SELECT * FROM testUDTF( TABLE(SELECT 1 AS device_id, 2 AS data_ds) WITH SINGLE PARTITION ORDER BY device_id, data_ds) ``` This query previously returned an obscure, unrelated error: ``` [UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_TABLE_ARGUMENT] Unsupported subquery expression: Table arguments are used in a function where they are not supported: 'UnresolvedTableValuedFunction [tvf], [table-argument#338 [], 'data_ds](https://issues.apache.org/jira/browse/SPARK-48180#338%20[],%20'data_ds), false +- Project [1 AS device_id#336, 2 AS data_ds#337](https://issues.apache.org/jira/browse/SPARK-48180#336,%202%20AS%20data_ds#337) +- OneRowRelation ``` Now it returns a reasonable error: ``` The table function call includes a table argument with an invalid partitioning/ordering specification: the ORDER BY clause included multiple expressions without parentheses surrounding them; please add parentheses around these expressions and then retry the query again. (line 4, pos 2) == SQL == SELECT * FROM testUDTF( TABLE(SELECT 1 AS device_id, 2 AS data_ds) WITH SINGLE PARTITION --^^^ ORDER BY device_id, data_ds) ``` ### Why are the changes needed? Here we improve error messages for common SQL syntax mistakes. ### Does this PR introduce _any_ user-facing change? Yes, see above. ### How was this patch tested? This PR adds test coverage. ### Was this patch authored or co-authored using generative AI tooling? No Closes #46451 from dtenedor/udtf-analyzer-bug. Authored-by: Daniel Tenedorio <daniel.tenedorio@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>

commit: 71f0eda71bc169a5245f4412ec0957728025a66c [log] [tgz]
author: Daniel Tenedorio <daniel.tenedorio@databricks.com> Fri May 10 08:44:07 2024 +0900
committer: Hyukjin Kwon <gurwls223@apache.org> Fri May 10 08:44:07 2024 +0900
tree: 41dc5e10e8f05f39dffe7d992121312aa6f16292
parent: e704b9e56b0cc862ebd5c95b9d023ab0a5ffdba7 [diff]
diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index 653224c..249f55f 100644
--- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4

@@ -838,9 +838,11 @@
     : ((WITH SINGLE PARTITION)
         | ((PARTITION | DISTRIBUTE) BY
             (((LEFT_PAREN partition+=expression (COMMA partition+=expression)* RIGHT_PAREN))
+            | (expression (COMMA invalidMultiPartitionExpression=expression)+)
             | partition+=expression)))
       ((ORDER | SORT) BY
         (((LEFT_PAREN sortItem (COMMA sortItem)* RIGHT_PAREN)
+        | (sortItem (COMMA invalidMultiSortItem=sortItem)+)
         | sortItem)))?
     ;
 

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 7d2355b..326f1e7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

@@ -1638,6 +1638,20 @@
       }
       partitionByExpressions = p.partition.asScala.map(expression).toSeq
       orderByExpressions = p.sortItem.asScala.map(visitSortItem).toSeq
+      def invalidPartitionOrOrderingExpression(clause: String): String = {
+        "The table function call includes a table argument with an invalid " +
+          s"partitioning/ordering specification: the $clause clause included multiple " +
+          "expressions without parentheses surrounding them; please add parentheses around " +
+          "these expressions and then retry the query again"
+      }
+      validate(
+        Option(p.invalidMultiPartitionExpression).isEmpty,
+        message = invalidPartitionOrOrderingExpression("PARTITION BY"),
+        ctx = p.invalidMultiPartitionExpression)
+      validate(
+        Option(p.invalidMultiSortItem).isEmpty,
+        message = invalidPartitionOrOrderingExpression("ORDER BY"),
+        ctx = p.invalidMultiSortItem)
     }
     validate(
       !(withSinglePartition && partitionByExpressions.nonEmpty),

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index 17dd734..8d01040 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala

@@ -1617,14 +1617,23 @@
           parameters = Map(
             "error" -> "'order'",
             "hint" -> ""))
-        val sql8 = s"select * from my_tvf(arg1 => table(select col1, col2, col3 from v2) " +
-          s"$partition by col1, col2 order by col2 asc, col3 desc)"
+        val sql8tableArg = "table(select col1, col2, col3 from v2)"
+        val sql8partition = s"$partition by col1, col2 order by col2 asc, col3 desc"
+        val sql8 = s"select * from my_tvf(arg1 => $sql8tableArg $sql8partition)"
         checkError(
           exception = parseException(sql8),
-          errorClass = "PARSE_SYNTAX_ERROR",
+          errorClass = "_LEGACY_ERROR_TEMP_0064",
           parameters = Map(
-            "error" -> "'order'",
-            "hint" -> ": extra input 'order'"))
+            "msg" ->
+              ("The table function call includes a table argument with an invalid " +
+              "partitioning/ordering specification: the PARTITION BY clause included multiple " +
+              "expressions without parentheses surrounding them; please add parentheses around " +
+              "these expressions and then retry the query again")),
+          context = ExpectedContext(
+            fragment = s"$sql8tableArg $sql8partition",
+            start = 29,
+            stop = 110 + partition.length)
+        )
       }
     }
   }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala
index 989597a..1eaf1d2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDTFSuite.scala

@@ -20,6 +20,7 @@
 import org.apache.spark.api.python.PythonEvalType
 import org.apache.spark.sql.{AnalysisException, IntegratedUDFTestUtils, QueryTest, Row}
 import org.apache.spark.sql.catalyst.expressions.{Add, Alias, Expression, FunctionTableSubqueryArgumentExpression, Literal}
+import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, OneRowRelation, Project, Repartition, RepartitionByExpression, Sort, SubqueryAlias}
 import org.apache.spark.sql.functions.lit
 import org.apache.spark.sql.internal.SQLConf
@@ -363,4 +364,29 @@
         Row("abc"))
     }
   }
+
+  test("SPARK-48180: Analyzer bug with multiple ORDER BY items for input table argument") {
+    assume(shouldTestPythonUDFs)
+    spark.udtf.registerPython("testUDTF", pythonUDTF)
+    checkError(
+      exception = intercept[ParseException](sql(
+        """
+          |SELECT * FROM testUDTF(
+          |  TABLE(SELECT 1 AS device_id, 2 AS data_ds)
+          |  WITH SINGLE PARTITION
+          |  ORDER BY device_id, data_ds)
+          |""".stripMargin)),
+      errorClass = "_LEGACY_ERROR_TEMP_0064",
+      parameters = Map("msg" ->
+        ("The table function call includes a table argument with an invalid " +
+          "partitioning/ordering specification: the ORDER BY clause included multiple " +
+          "expressions without parentheses surrounding them; please add parentheses around these " +
+          "expressions and then retry the query again")),
+      context = ExpectedContext(
+        fragment = "TABLE(SELECT 1 AS device_id, 2 AS data_ds)\n  " +
+          "WITH SINGLE PARTITION\n  " +
+          "ORDER BY device_id, data_ds",
+        start = 27,
+        stop = 122))
+  }
 }
commit	71f0eda71bc169a5245f4412ec0957728025a66c	[log] [tgz]
author	Daniel Tenedorio <daniel.tenedorio@databricks.com>	Fri May 10 08:44:07 2024 +0900
committer	Hyukjin Kwon <gurwls223@apache.org>	Fri May 10 08:44:07 2024 +0900
tree	41dc5e10e8f05f39dffe7d992121312aa6f16292
parent	e704b9e56b0cc862ebd5c95b9d023ab0a5ffdba7 [diff]