# python/pyspark/sql/tests/typing/test_session.yml - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 # Type-checking case: createDataFrame called on a list of tuples, on its own
 # and combined with a tuple of column names, a StructType, a schema string,
 # and samplingRatio (by itself or together with column names).
 # There is no `out:` key for this case -- presumably the runner then expects
 # the type checker to report nothing for `main`; confirm against the plugin.
 - case: createDataFrameStructsValid
   main: |
     from pyspark.sql import SparkSession
     from pyspark.sql.types import StructType, StructField, StringType, IntegerType

     spark = SparkSession.builder.getOrCreate()

     data = [('Alice', 1)]
     schema = StructType([
         StructField("name", StringType(), True),
         StructField("age", IntegerType(), True)
     ])

     # Valid structs
     spark.createDataFrame(data)
     spark.createDataFrame(data, samplingRatio=0.1)
     spark.createDataFrame(data, ("name", "age"))
     spark.createDataFrame(data, schema)
     spark.createDataFrame(data, "name string, age integer")
     spark.createDataFrame([(1, ("foo", "bar"))], ("_1", "_2"))
     spark.createDataFrame(data, ("name", "age"), samplingRatio=0.1)


 # Type-checking case: createDataFrame called on lists of scalars (ints,
 # strings) paired with an atomic schema, given either as a type object
 # (IntegerType()) or as a type-name string ("string").
 # There is no `out:` key for this case -- presumably the runner then expects
 # the type checker to report nothing for `main`; confirm against the plugin.
 # Only the names the snippet actually uses are imported.
 - case: createDataFrameScalarsValid
   main: |
     from pyspark.sql import SparkSession
     from pyspark.sql.types import IntegerType

     spark = SparkSession.builder.getOrCreate()

     # Scalars
     spark.createDataFrame([1, 2, 3], IntegerType())
     spark.createDataFrame(["foo", "bar"], "string")


 # Type-checking case: two createDataFrame calls on a list of tuples that are
 # expected to be rejected -- one pairing the tuples with IntegerType(), one
 # passing a StructType schema together with samplingRatio. The diagnostics
 # the checker should produce are listed under `out:`.
 - case: createDataFrameStructsInvalid
   main: |
     from pyspark.sql import SparkSession
     from pyspark.sql.types import StructType, StructField, StringType, IntegerType

     spark = SparkSession.builder.getOrCreate()

     data = [('Alice', 1)]

     schema = StructType([
         StructField("name", StringType(), True),
         StructField("age", IntegerType(), True)
     ])

     # Invalid product should have StructType schema
     spark.createDataFrame(data, IntegerType())

     # This shouldn't type check, though is technically speaking valid
     # because  samplingRatio is ignored
     spark.createDataFrame(data, schema, samplingRatio=0.1)

   # Expected diagnostics. `main:14` is the createDataFrame(data, IntegerType())
   # call and `main:18` is the call passing both schema and samplingRatio; the
   # numbers count lines from the top of `main`, so adding or removing lines
   # there means these entries must be renumbered. The `note:` lines list every
   # createDataFrame overload signature, so they also need updating whenever
   # those signatures change.
   out: |
     main:14: error: Value of type variable "AtomicValue" of "createDataFrame" of "SparkSession" cannot be "tuple[str, int]"  [type-var]
     main:18: error: No overload variant of "createDataFrame" of "SparkSession" matches argument types "list[tuple[str, int]]", "StructType", "float"  [call-overload]
     main:18: note: Possible overload variants:
     main:18: note:     def [RowLike: (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: Iterable[RowLike], schema: list[str] | tuple[str, ...] = ..., samplingRatio: float | None = ...) -> DataFrame
     main:18: note:     def [RowLike: (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: RDD[RowLike], schema: list[str] | tuple[str, ...] = ..., samplingRatio: float | None = ...) -> DataFrame
     main:18: note:     def [RowLike: (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: Iterable[RowLike], schema: StructType | str, *, verifySchema: bool = ...) -> DataFrame
     main:18: note:     def [RowLike: (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: RDD[RowLike], schema: StructType | str, *, verifySchema: bool = ...) -> DataFrame
     main:18: note:     def [AtomicValue: (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: RDD[AtomicValue], schema: AtomicType | str, verifySchema: bool = ...) -> DataFrame
     main:18: note:     def [AtomicValue: (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: Iterable[AtomicValue], schema: AtomicType | str, verifySchema: bool = ...) -> DataFrame
     main:18: note:     def createDataFrame(self, data: DataFrame, samplingRatio: float | None = ...) -> DataFrame
     main:18: note:     def createDataFrame(self, data: Any, samplingRatio: float | None = ...) -> DataFrame
     main:18: note:     def createDataFrame(self, data: DataFrame, schema: StructType | str, verifySchema: bool = ...) -> DataFrame
     main:18: note:     def createDataFrame(self, data: Any, schema: StructType | str, verifySchema: bool = ...) -> DataFrame

 # Type-checking case: createDataFrame called with the result of
 # sparkContext.emptyRDD() as data and an empty StructType() passed through
 # the `schema` keyword.
 # There is no `out:` key for this case -- presumably the runner then expects
 # the type checker to report nothing for `main`; confirm against the plugin.
 - case: createDataFrameFromEmptyRdd
   main: |
     from pyspark.sql import SparkSession
     from pyspark.sql.types import StructType

     spark = SparkSession.builder.getOrCreate()

     spark.createDataFrame(
         spark.sparkContext.emptyRDD(),
         schema=StructType(),
     )
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	- case: createDataFrameStructsValid
	main: \|
	from pyspark.sql import SparkSession
	from pyspark.sql.types import StructType, StructField, StringType, IntegerType

	spark = SparkSession.builder.getOrCreate()

	data = [('Alice', 1)]
	schema = StructType([
	StructField("name", StringType(), True),
	StructField("age", IntegerType(), True)
	])

	# Valid structs
	spark.createDataFrame(data)
	spark.createDataFrame(data, samplingRatio=0.1)
	spark.createDataFrame(data, ("name", "age"))
	spark.createDataFrame(data, schema)
	spark.createDataFrame(data, "name string, age integer")
	spark.createDataFrame([(1, ("foo", "bar"))], ("_1", "_2"))
	spark.createDataFrame(data, ("name", "age"), samplingRatio=0.1)


	- case: createDataFrameScalarsValid
	main: \|

	from pyspark.sql import SparkSession
	from pyspark.sql.types import StructType, StructField, StringType, IntegerType

	spark = SparkSession.builder.getOrCreate()

	# Scalars
	spark.createDataFrame([1, 2, 3], IntegerType())
	spark.createDataFrame(["foo", "bar"], "string")


	- case: createDataFrameStructsInvalid
	main: \|
	from pyspark.sql import SparkSession
	from pyspark.sql.types import StructType, StructField, StringType, IntegerType

	spark = SparkSession.builder.getOrCreate()

	data = [('Alice', 1)]

	schema = StructType([
	StructField("name", StringType(), True),
	StructField("age", IntegerType(), True)
	])

	# Invalid product should have StructType schema
	spark.createDataFrame(data, IntegerType())

	# This shouldn't type check, though is technically speaking valid
	# because samplingRatio is ignored
	spark.createDataFrame(data, schema, samplingRatio=0.1)

	out: \|
	main:14: error: Value of type variable "AtomicValue" of "createDataFrame" of "SparkSession" cannot be "tuple[str, int]" [type-var]
	main:18: error: No overload variant of "createDataFrame" of "SparkSession" matches argument types "list[tuple[str, int]]", "StructType", "float" [call-overload]
	main:18: note: Possible overload variants:
	main:18: note: def [RowLike: (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: Iterable[RowLike], schema: list[str] \| tuple[str, ...] = ..., samplingRatio: float \| None = ...) -> DataFrame
	main:18: note: def [RowLike: (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: RDD[RowLike], schema: list[str] \| tuple[str, ...] = ..., samplingRatio: float \| None = ...) -> DataFrame
	main:18: note: def [RowLike: (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: Iterable[RowLike], schema: StructType \| str, *, verifySchema: bool = ...) -> DataFrame
	main:18: note: def [RowLike: (list[Any], tuple[Any, ...], Row)] createDataFrame(self, data: RDD[RowLike], schema: StructType \| str, *, verifySchema: bool = ...) -> DataFrame
	main:18: note: def [AtomicValue: (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: RDD[AtomicValue], schema: AtomicType \| str, verifySchema: bool = ...) -> DataFrame
	main:18: note: def [AtomicValue: (datetime, date, Decimal, bool, str, int, float)] createDataFrame(self, data: Iterable[AtomicValue], schema: AtomicType \| str, verifySchema: bool = ...) -> DataFrame
	main:18: note: def createDataFrame(self, data: DataFrame, samplingRatio: float \| None = ...) -> DataFrame
	main:18: note: def createDataFrame(self, data: Any, samplingRatio: float \| None = ...) -> DataFrame
	main:18: note: def createDataFrame(self, data: DataFrame, schema: StructType \| str, verifySchema: bool = ...) -> DataFrame
	main:18: note: def createDataFrame(self, data: Any, schema: StructType \| str, verifySchema: bool = ...) -> DataFrame

	- case: createDataFrameFromEmptyRdd
	main: \|
	from pyspark.sql import SparkSession
	from pyspark.sql.types import StructType

	spark = SparkSession.builder.getOrCreate()

	spark.createDataFrame(
	spark.sparkContext.emptyRDD(),
	schema=StructType(),
	)