[SPARK-48060][SS][TESTS] Fix `StreamingQueryHashPartitionVerifySuite` to update golden files correctly
### What changes were proposed in this pull request?
This PR aims to fix `StreamingQueryHashPartitionVerifySuite` to update golden files correctly.
- The documentation is added.
- Newly generated files are updated.
### Why are the changes needed?
Previously, `SPARK_GENERATE_GOLDEN_FILES` didn't work as expected because it updated the files under the `target` directory. We need to update the `src/test` files.
**BEFORE**
```
$ SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite"
$ git status
On branch master
Your branch is up to date with 'apache/master'.
nothing to commit, working tree clean
```
**AFTER**
```
$ SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite" \
-Dspark.sql.test.randomDataGenerator.maxStrLen=100 \
-Dspark.sql.test.randomDataGenerator.maxArraySize=4
$ git status
On branch SPARK-48060
Your branch is up to date with 'dongjoon/SPARK-48060'.
Changes not staged for commit:
(use "git add <file>..." to update what will be committed)
(use "git restore <file>..." to discard changes in working directory)
modified: sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas
modified: sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds
no changes added to commit (use "git add" and/or "git commit -a")
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Pass the CIs. I regenerated the data as follows.
```
$ SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite" \
-Dspark.sql.test.randomDataGenerator.maxStrLen=100 \
-Dspark.sql.test.randomDataGenerator.maxArraySize=4
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #46304 from dongjoon-hyun/SPARK-48060.
Authored-by: Dongjoon Hyun <dhyun@apple.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
diff --git a/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas b/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas
index 8d6ff94..f6eadd77 100644
--- a/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas
+++ b/sql/core/src/test/resources/structured-streaming/partition-tests/randomSchemas
@@ -1 +1 @@
-col_0 STRUCT<col_0: BINARY, col_1: BIGINT NOT NULL, col_2: ARRAY<DOUBLE> NOT NULL, col_3: FLOAT NOT NULL, col_4: INT NOT NULL>,col_1 STRUCT<col_0: STRING, col_1: TIMESTAMP NOT NULL, col_2: STRUCT<col_0: FLOAT NOT NULL>, col_3: ARRAY<INT> NOT NULL, col_4: ARRAY<BINARY>, col_5: TIMESTAMP NOT NULL, col_6: STRUCT<col_0: ARRAY<DOUBLE>, col_1: BIGINT NOT NULL> NOT NULL, col_7: ARRAY<INT> NOT NULL, col_8: ARRAY<BIGINT>, col_9: BIGINT NOT NULL> NOT NULL,col_2 BIGINT NOT NULL,col_3 STRUCT<col_0: BINARY> NOT NULL,col_4 STRUCT<col_0: STRUCT<col_0: ARRAY<FLOAT> NOT NULL> NOT NULL> NOT NULL,col_5 ARRAY<INT> NOT NULL
+col_0 ARRAY<BINARY>,col_1 STRUCT<col_0: STRING> NOT NULL,col_2 STRING NOT NULL,col_3 STRUCT<col_0: INT, col_1: ARRAY<STRING>, col_2: ARRAY<DOUBLE> NOT NULL> NOT NULL,col_4 BINARY NOT NULL,col_5 ARRAY<BINARY> NOT NULL,col_6 ARRAY<FLOAT>,col_7 DOUBLE NOT NULL,col_8 ARRAY<DOUBLE> NOT NULL,col_9 ARRAY<TIMESTAMP>,col_10 FLOAT NOT NULL,col_11 STRUCT<col_0: STRUCT<col_0: ARRAY<TIMESTAMP> NOT NULL>, col_1: STRUCT<col_0: ARRAY<STRING> NOT NULL, col_1: INT, col_2: STRUCT<col_0: STRUCT<col_0: STRING>> NOT NULL>, col_2: BINARY NOT NULL, col_3: STRUCT<col_0: ARRAY<TIMESTAMP> NOT NULL> NOT NULL> NOT NULL,col_12 ARRAY<STRING>
diff --git a/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds b/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds
index 3902d6d..1b2eda8 100644
--- a/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds
+++ b/sql/core/src/test/resources/structured-streaming/partition-tests/rowsAndPartIds
Binary files differ
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala
index 3423b8b..3d8c20a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryHashPartitionVerifySuite.scala
@@ -31,11 +31,24 @@
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.types.{BinaryType, DataType, DoubleType, FloatType, IntegerType, LongType, StringType, StructType, TimestampType}
+/**
+ * To run the test suite:
+ * {{{
+ * build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite"
+ * }}}
+ *
+ * To re-generate the golden file with size limit under 10Mb, run:
+ * {{{
+ * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *StreamingQueryHashPartitionVerifySuite"
+ * -Dspark.sql.test.randomDataGenerator.maxStrLen=100
+ * -Dspark.sql.test.randomDataGenerator.maxArraySize=4
+ * }}}
+ */
class StreamingQueryHashPartitionVerifySuite extends StreamTest {
- // Configs for golden file
- private val goldenFileURI =
- this.getClass.getResource("/structured-streaming/partition-tests/").toURI
+ // A golden file directory in `src/test` instead of `target` directory.
+ private val goldenFileURI = getWorkspaceFilePath(
+ "sql", "core", "src", "test", "resources", "structured-streaming", "partition-tests").toUri
private val schemaFileName = "randomSchemas" // files for storing random input schemas
private val rowAndPartIdFilename =
@@ -152,9 +165,6 @@
val rowAndPartIdFile = new File(goldenFileURI.getPath, rowAndPartIdFilename)
if (regenerateGoldenFiles) {
- // To limit the golden file size under 10Mb, please set the final val MAX_STR_LEN: Int = 100
- // and final val MAX_ARR_SIZE: Int = 4 in org.apache.spark.sql.RandomDataGenerator
-
val random = new Random()
val schemas = getRandomSchemas(random)