#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
- case: sampling
  main: |
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(1)

    df.sample(1.0)
    df.sample(0.5, 3)
    df.sample(fraction=0.5, seed=3)
    df.sample(withReplacement=True, fraction=0.5, seed=3)
    df.sample(fraction=1.0)
    df.sample(False, fraction=1.0)

    # Will raise a runtime error, though not a typecheck error, as bool is
    # duck-type compatible with float.
    df.sample(True)
    df.sample(withReplacement=False)
  out: |
    main:16: error: No overload variant of "sample" of "DataFrame" matches argument type "bool"  [call-overload]
    main:16: note: Possible overload variants:
    main:16: note:     def sample(self, fraction: float, seed: int | None = ...) -> DataFrame
    main:16: note:     def sample(self, withReplacement: bool | None, fraction: float, seed: int | None = ...) -> DataFrame
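# A hedged companion sketch (assumption, not from the original suite): the
# overload variants above mark withReplacement as Optional, so passing None
# explicitly should typecheck; the case name is illustrative only.
- case: samplingWithReplacementNone
  main: |
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(1)

    df.sample(None, 0.5)
    df.sample(withReplacement=None, fraction=0.5, seed=3)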
- case: selectColumns
  main: |
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    data = [('Alice', 1)]
    df = spark.createDataFrame(data, schema="name string, age int")

    df.select(["name", "age"])
    df.select([col("name"), col("age")])
    df.select(["name", col("age")])  # E: Argument 1 to "select" of "DataFrame" has incompatible type "list[object]"; expected "list[Column] | list[str]"  [arg-type]
- case: groupBy
  main: |
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    data = [('Alice', 1)]
    df = spark.createDataFrame(data, schema="name string, age int")

    df.groupBy(["name", "age"])
    df.groupby(["name", "age"])
    df.groupBy([col("name"), col("age")])
    df.groupby([col("name"), col("age")])
    df.groupBy(["name", col("age")])  # E: Argument 1 to "groupBy" of "DataFrame" has incompatible type "list[object]"; expected "list[Column] | list[str] | list[int]"  [arg-type]
- case: rollup
  main: |
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    data = [('Alice', 1)]
    df = spark.createDataFrame(data, schema="name string, age int")

    df.rollup(["name", "age"])
    df.rollup([col("name"), col("age")])
    df.rollup(["name", col("age")])  # E: Argument 1 to "rollup" of "DataFrame" has incompatible type "list[object]"; expected "list[Column] | list[str]"  [arg-type]
- case: cube
  main: |
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    data = [('Alice', 1)]
    df = spark.createDataFrame(data, schema="name string, age int")

    df.cube(["name", "age"])
    df.cube([col("name"), col("age")])
    df.cube(["name", col("age")])  # E: Argument 1 to "cube" of "DataFrame" has incompatible type "list[object]"; expected "list[Column] | list[str]"  [arg-type]
- case: dropColumns
  main: |
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import lit, col

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(1)

    df.drop("id")
    df.drop("id", "foo")
    df.drop(df.id)
    df.drop(col("id"), col("foo"))
  out: |
    main:10: error: No overload variant of "drop" of "DataFrame" matches argument types "Column", "Column"  [call-overload]
    main:10: note: Possible overload variants:
    main:10: note:     def drop(self, cols: Column | str) -> DataFrame
    main:10: note:     def drop(self, *cols: str) -> DataFrame
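# A hedged follow-up sketch (assumption, not from the original suite): per the
# overload variants above, only a single Column can be dropped per call, so
# chaining drop is the typecheck-clean way to drop several Column objects.
- case: dropColumnsChained
  main: |
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(1)

    df.drop(col("id"))
    df.drop(df.id).drop(col("foo"))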
- case: fillNullValues
  main: |
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, 2)], schema=("id1", "id2"))

    df.fillna(value=1, subset="id1")
    df.fillna(value=1, subset=("id1", "id2"))
    df.fillna(value=1, subset=["id1"])