[SPARK-48060][SS][TESTS] Fix `StreamingQueryHashPartitionVerifySuite` to update golden files correctly
### What changes were proposed in this pull request?
This PR aims to fix `StreamingQueryHashPartitionVerifySuite` to update golden files correctly.
- The documentation is added.
- Newly generated files are updated.
### Why are the changes needed?
Previously, `SPARK_GENERATE_GOLDEN_FILES` didn't work as expected because it updated the files under the `target` directory. We need to update the `src/test` files.
**BEFORE**
```
$ SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite"
$ git status
On branch master
Your branch is up to date with 'apache/master'.
nothing to commit, working tree clean
```
**AFTER**
```
$ SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite" \
-Dspark.sql.test.randomDataGenerator.maxStrLen=100 \
-Dspark.sql.test.randomDataGenerator.maxArraySize=4
$ git status
On branch SPARK-48060
Your branch is up to date with 'dongjoon/SPARK-48060'.
Changes not staged for commit:
(use "git add <file>..." to update what will be committed)
(use "git restore <file>..." to discard changes in working directory)
modified: sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas
modified: sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds
no changes added to commit (use "git add" and/or "git commit -a")
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Pass the CIs. I regenerated the data as follows.
```
$ SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite" \
-Dspark.sql.test.randomDataGenerator.maxStrLen=100 \
-Dspark.sql.test.randomDataGenerator.maxArraySize=4
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #46304 from dongjoon-hyun/SPARK-48060.
Authored-by: Dongjoon Hyun <dhyun@apple.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
diff --git a/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas b/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas
index 8d6ff94..f6eadd77 100644
--- a/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas
+++ b/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas
@@ -1 +1 @@
-col_0 STRUCT<col_0: BINARY, col_1: BIGINT NOT NULL, col_2: ARRAY<DOUBLE> NOT NULL, col_3: FLOAT NOT NULL, col_4: INT NOT NULL>,col_1 STRUCT<col_0: STRING, col_1: TIMESTAMP NOT NULL, col_2: STRUCT<col_0: FLOAT NOT NULL>, col_3: ARRAY<INT> NOT NULL, col_4: ARRAY<BINARY>, col_5: TIMESTAMP NOT NULL, col_6: STRUCT<col_0: ARRAY<DOUBLE>, col_1: BIGINT NOT NULL> NOT NULL, col_7: ARRAY<INT> NOT NULL, col_8: ARRAY<BIGINT>, col_9: BIGINT NOT NULL> NOT NULL,col_2 BIGINT NOT NULL,col_3 STRUCT<col_0: BINARY> NOT NULL,col_4 STRUCT<col_0: STRUCT<col_0: ARRAY<FLOAT> NOT NULL> NOT NULL> NOT NULL,col_5 ARRAY<INT> NOT NULL
+col_0 ARRAY<BINARY>,col_1 STRUCT<col_0: STRING> NOT NULL,col_2 STRING NOT NULL,col_3 STRUCT<col_0: INT, col_1: ARRAY<STRING>, col_2: ARRAY<DOUBLE> NOT NULL> NOT NULL,col_4 BINARY NOT NULL,col_5 ARRAY<BINARY> NOT NULL,col_6 ARRAY<FLOAT>,col_7 DOUBLE NOT NULL,col_8 ARRAY<DOUBLE> NOT NULL,col_9 ARRAY<TIMESTAMP>,col_10 FLOAT NOT NULL,col_11 STRUCT<col_0: STRUCT<col_0: ARRAY<TIMESTAMP> NOT NULL>, col_1: STRUCT<col_0: ARRAY<STRING> NOT NULL, col_1: INT, col_2: STRUCT<col_0: STRUCT<col_0: STRING>> NOT NULL>, col_2: BINARY NOT NULL, col_3: STRUCT<col_0: ARRAY<TIMESTAMP> NOT NULL> NOT NULL> NOT NULL,col_12 ARRAY<STRING>
diff --git a/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds b/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds
index 3902d6d..1b2eda8 100644
--- a/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds
+++ b/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds
Binary files differ
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala
index 3423b8b..3d8c20a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala
@@ -31,11 +31,24 @@
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.types.{BinaryType, DataType, DoubleType, FloatType, IntegerType, LongType, StringType, StructType, TimestampType}
+/**
+ * To run the test suite:
+ * {{{
+ * build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite"
+ * }}}
+ *
+ * To re-generate the golden file with size limit under 10Mb, run:
+ * {{{
+ * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite"
+ * -Dspark.sql.test.randomDataGenerator.maxStrLen=100
+ * -Dspark.sql.test.randomDataGenerator.maxArraySize=4
+ * }}}
+ */
class StreamingQueryHashPartitionVerifySuite extends StreamTest {
- // Configs for golden file
- private val goldenFileURI =
- this.getClass.getResource("/structured-streaming/partition-tests/").toURI
+ // A golden file directory in `src/test` instead of `target` directory.
+ private val goldenFileURI = getWorkspaceFilePath(
+ "sql", "core", "src", "test", "resources", "structured-streaming", "partition-tests").toUri
private val schemaFileName = "randomSchemas" // files for storing random input schemas
private val rowAndPartIdFilename =
@@ -152,9 +165,6 @@
val rowAndPartIdFile = new File(goldenFileURI.getPath, rowAndPartIdFilename)
if (regenerateGoldenFiles) {
- // To limit the golden file size under 10Mb, please set the final val MAX_STR_LEN: Int = 100
- // and final val MAX_ARR_SIZE: Int = 4 in org.apache.spark.sql.RandomDataGenerator
-
val random = new Random()
val schemas = getRandomSchemas(random)