#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# mypy: disable-error-code="empty-body"
import sys
from typing import (
overload,
Any,
TYPE_CHECKING,
Union,
)
from pyspark.sql.utils import dispatch_col_method
from pyspark.sql.types import DataType
from pyspark.errors import PySparkValueError
if TYPE_CHECKING:
from py4j.java_gateway import JavaObject
from pyspark.sql._typing import LiteralType, DecimalLiteral, DateTimeLiteral
from pyspark.sql.window import WindowSpec
__all__ = ["Column"]
class Column:
"""
A column in a DataFrame.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
Column instances can be created by
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
Select a column out of a DataFrame
>>> df.name
Column<'name'>
>>> df["name"]
Column<'name'>
Create from an expression
>>> df.age + 1
Column<...>
>>> 1 / df.age
Column<...>
"""
# HACK ALERT!! This is to reduce backward-compatibility concerns, and returns a
# Spark Classic Column by default. This is NOT an API and is NOT supposed to
# be invoked directly. DO NOT use this constructor.
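# Column instances should instead be obtained from a DataFrame (e.g. `df.name` or
# `df["name"]`, as shown in the class docstring above) or via
# `pyspark.sql.functions.col("name")`.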
def __new__(
cls,
jc: "JavaObject",
) -> "Column":
from pyspark.sql.classic.column import Column
return Column.__new__(Column, jc)
def __init__(self, jc: "JavaObject") -> None:
self._jc = jc
# arithmetic operators
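# Each of these builds a new, lazy Column expression, e.g. `df.age + 1` or
# `1 / df.age` as in the class docstring; nothing is evaluated until an action
# such as `collect()` or `show()` runs.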
@dispatch_col_method
def __neg__(self) -> "Column":
...
@dispatch_col_method
def __add__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __sub__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __mul__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __div__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __truediv__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __mod__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __radd__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rsub__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rmul__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rdiv__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rtruediv__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rmod__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __pow__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rpow__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
# comparison operators
@dispatch_col_method
def __eq__( # type: ignore[override]
self,
other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"],
) -> "Column":
"""binary function"""
...
@dispatch_col_method
def __ne__( # type: ignore[override]
self,
other: Any,
) -> "Column":
"""binary function"""
...
@dispatch_col_method
def __lt__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __le__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __ge__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __gt__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def eqNullSafe(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Equality test that is safe for null values.
.. versionadded:: 2.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
a value or :class:`Column`
Examples
--------
>>> from pyspark.sql import Row
>>> df1 = spark.createDataFrame([
... Row(id=1, value='foo'),
... Row(id=2, value=None)
... ])
>>> df1.select(
... df1['value'] == 'foo',
... df1['value'].eqNullSafe('foo'),
... df1['value'].eqNullSafe(None)
... ).show()
+-------------+---------------+----------------+
|(value = foo)|(value <=> foo)|(value <=> NULL)|
+-------------+---------------+----------------+
| true| true| false|
| NULL| false| true|
+-------------+---------------+----------------+
>>> df2 = spark.createDataFrame([
... Row(value = 'bar'),
... Row(value = None)
... ])
>>> df1.join(df2, df1["value"] == df2["value"]).count()
0
>>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count()
1
>>> df2 = spark.createDataFrame([
... Row(id=1, value=float('NaN')),
... Row(id=2, value=42.0),
... Row(id=3, value=None)
... ])
>>> df2.select(
... df2['value'].eqNullSafe(None),
... df2['value'].eqNullSafe(float('NaN')),
... df2['value'].eqNullSafe(42.0)
... ).show()
+----------------+---------------+----------------+
|(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
+----------------+---------------+----------------+
| false| true| false|
| false| false| true|
| true| false| false|
+----------------+---------------+----------------+
Notes
-----
Unlike Pandas, PySpark doesn't consider NaN values to be NULL. See the
`NaN Semantics <https://spark.apache.org/docs/latest/sql-ref-datatypes.html#nan-semantics>`_
for details.
"""
...
# `and`, `or`, `not` cannot be overloaded in Python,
# so use bitwise operators as boolean operators
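# For example (a sketch using the `df` from the class docstring):
# `df.filter((df.age > 2) & ~(df.name == "Bob"))` combines predicates, whereas the
# plain `and`/`or`/`not` keywords cannot be overloaded and do not work on Columns.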
@dispatch_col_method
def __and__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __or__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __invert__(self) -> "Column":
...
@dispatch_col_method
def __rand__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __ror__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
# container operators
@dispatch_col_method
def __contains__(self, item: Any) -> None:
raise PySparkValueError(
error_class="CANNOT_APPLY_IN_FOR_COLUMN",
message_parameters={},
)
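# Membership tests such as `"Bob" in df.name` are intentionally unsupported; use
# the `contains` or `isin` methods defined below instead, e.g.
# `df.name.contains("Bob")`.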
# bitwise operators
@dispatch_col_method
def bitwiseOR(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Compute bitwise OR of this expression with another expression.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
a value or :class:`Column` to calculate bitwise or(|) with
this :class:`Column`.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(a=170, b=75)])
>>> df.select(df.a.bitwiseOR(df.b)).collect()
[Row((a | b)=235)]
"""
...
@dispatch_col_method
def bitwiseAND(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Compute bitwise AND of this expression with another expression.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
a value or :class:`Column` to calculate bitwise and(&) with
this :class:`Column`.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(a=170, b=75)])
>>> df.select(df.a.bitwiseAND(df.b)).collect()
[Row((a & b)=10)]
"""
...
@dispatch_col_method
def bitwiseXOR(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Compute bitwise XOR of this expression with another expression.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
a value or :class:`Column` to calculate bitwise xor(^) with
this :class:`Column`.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(a=170, b=75)])
>>> df.select(df.a.bitwiseXOR(df.b)).collect()
[Row((a ^ b)=225)]
"""
...
@dispatch_col_method
def getItem(self, key: Any) -> "Column":
"""
An expression that gets an item at position ``ordinal`` out of a list,
or gets an item by key out of a dict.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
key
a literal value, or a :class:`Column` expression.
The ordinal position of the item in a list, or the key of the item in a dict.
.. deprecated:: 3.0.0
:class:`Column` as a parameter is deprecated.
Returns
-------
:class:`Column`
Column representing the item(s) retrieved at the given position from a list, or by key from a dict.
Examples
--------
>>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
>>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
+----+------+
|l[0]|d[key]|
+----+------+
| 1| value|
+----+------+
"""
...
@dispatch_col_method
def getField(self, name: Any) -> "Column":
"""
An expression that gets a field by name in a :class:`StructType`.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
name
a literal value, or a :class:`Column` expression.
The name of the field to retrieve from the struct.
.. deprecated:: 3.0.0
:class:`Column` as a parameter is deprecated.
Returns
-------
:class:`Column`
Column representing the struct field retrieved by name.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))])
>>> df.select(df.r.getField("b")).show()
+---+
|r.b|
+---+
| b|
+---+
>>> df.select(df.r.a).show()
+---+
|r.a|
+---+
| 1|
+---+
"""
...
@dispatch_col_method
def withField(self, fieldName: str, col: "Column") -> "Column":
"""
An expression that adds/replaces a field in :class:`StructType` by name.
.. versionadded:: 3.1.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
fieldName : str
a literal string.
The name of the field to add or replace.
col : :class:`Column`
A :class:`Column` expression for the column with `fieldName`.
Returns
-------
:class:`Column`
Column representing the struct with the field added or replaced by `fieldName`.
Examples
--------
>>> from pyspark.sql import Row
>>> from pyspark.sql.functions import lit
>>> df = spark.createDataFrame([Row(a=Row(b=1, c=2))])
>>> df.withColumn('a', df['a'].withField('b', lit(3))).select('a.b').show()
+---+
| b|
+---+
| 3|
+---+
>>> df.withColumn('a', df['a'].withField('d', lit(4))).select('a.d').show()
+---+
| d|
+---+
| 4|
+---+
"""
...
@dispatch_col_method
def dropFields(self, *fieldNames: str) -> "Column":
"""
An expression that drops fields in :class:`StructType` by name.
This is a no-op if the schema doesn't contain field name(s).
.. versionadded:: 3.1.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
fieldNames : str
Desired field names to drop (collects all positional arguments passed).
Any field matching one of the given names is dropped from the struct.
Returns
-------
:class:`Column`
Column representing the struct with the given field(s) dropped.
Examples
--------
>>> from pyspark.sql import Row
>>> from pyspark.sql.functions import col, lit
>>> df = spark.createDataFrame([
... Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))])
>>> df.withColumn('a', df['a'].dropFields('b')).show()
+-----------------+
| a|
+-----------------+
|{2, 3, {4, 5, 6}}|
+-----------------+
>>> df.withColumn('a', df['a'].dropFields('b', 'c')).show()
+--------------+
| a|
+--------------+
|{3, {4, 5, 6}}|
+--------------+
This method supports dropping multiple nested fields directly e.g.
>>> df.withColumn("a", col("a").dropFields("e.g", "e.h")).show()
+--------------+
| a|
+--------------+
|{1, 2, 3, {4}}|
+--------------+
However, if you are going to add/replace multiple nested fields,
it is preferred to extract out the nested struct before
adding/replacing multiple fields e.g.
>>> df.select(col("a").withField(
... "e", col("a.e").dropFields("g", "h")).alias("a")
... ).show()
+--------------+
| a|
+--------------+
|{1, 2, 3, {4}}|
+--------------+
"""
...
@dispatch_col_method
def __getattr__(self, item: Any) -> "Column":
"""
An expression that gets an item at position ``ordinal`` out of a list,
or gets an item by key out of a dict.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
item
a literal value.
Returns
-------
:class:`Column`
Column representing the item retrieved by key from a dict.
Examples
--------
>>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
>>> df.select(df.d.key).show()
+------+
|d[key]|
+------+
| value|
+------+
"""
...
@dispatch_col_method
def __getitem__(self, k: Any) -> "Column":
"""
An expression that gets an item at position ``ordinal`` out of a list,
or gets an item by key out of a dict.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
k
a literal value, or a slice object without step.
Returns
-------
:class:`Column`
Column representing the item retrieved by key from a dict, or the substring sliced by
the given slice object.
Examples
--------
>>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
>>> df.select(df.l[slice(1, 3)], df.d['key']).show()
+---------------+------+
|substr(l, 1, 3)|d[key]|
+---------------+------+
| abc| value|
+---------------+------+
"""
...
@dispatch_col_method
def __iter__(self) -> None:
...
# string methods
@dispatch_col_method
def contains(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Contains the other element. Returns a boolean :class:`Column` based on a string match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
The string or value to search for, as a literal or a :class:`Column`.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.contains('o')).collect()
[Row(age=5, name='Bob')]
"""
...
@dispatch_col_method
def startswith(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
String starts with. Returns a boolean :class:`Column` based on a string match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : :class:`Column` or str
string at start of line (do not use a regex `^`)
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.startswith('Al')).collect()
[Row(age=2, name='Alice')]
>>> df.filter(df.name.startswith('^Al')).collect()
[]
"""
...
@dispatch_col_method
def endswith(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
String ends with. Returns a boolean :class:`Column` based on a string match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : :class:`Column` or str
string at end of line (do not use a regex `$`)
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.endswith('ice')).collect()
[Row(age=2, name='Alice')]
>>> df.filter(df.name.endswith('ice$')).collect()
[]
"""
...
@dispatch_col_method
def like(self: "Column", other: str) -> "Column":
"""
SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : str
a SQL LIKE pattern
See Also
--------
pyspark.sql.Column.rlike
Returns
-------
:class:`Column`
Column of booleans showing whether each element
in the Column is matched by SQL LIKE pattern.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.like('Al%')).collect()
[Row(age=2, name='Alice')]
"""
...
@dispatch_col_method
def rlike(self: "Column", other: str) -> "Column":
"""
SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex
match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : str
an extended regex expression
Returns
-------
:class:`Column`
Column of booleans showing whether each element
in the Column is matched by extended regex expression.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.rlike('ice$')).collect()
[Row(age=2, name='Alice')]
"""
...
@dispatch_col_method
def ilike(self: "Column", other: str) -> "Column":
"""
SQL ILIKE expression (case insensitive LIKE). Returns a boolean :class:`Column`
based on a case insensitive match.
.. versionadded:: 3.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : str
a SQL LIKE pattern
See Also
--------
pyspark.sql.Column.rlike
Returns
-------
:class:`Column`
Column of booleans showing whether each element
in the Column is matched by SQL LIKE pattern.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.ilike('%Ice')).collect()
[Row(age=2, name='Alice')]
"""
...
@overload
def substr(self, startPos: int, length: int) -> "Column":
...
@overload
def substr(self, startPos: "Column", length: "Column") -> "Column":
...
@dispatch_col_method
def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) -> "Column":
"""
Return a :class:`Column` which is a substring of the column.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
startPos : :class:`Column` or int
start position
length : :class:`Column` or int
length of the substring
Returns
-------
:class:`Column`
Column representing the substring of each value in the original Column.
Examples
--------
Example 1. Using integers for the input arguments.
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.name.substr(1, 3).alias("col")).collect()
[Row(col='Ali'), Row(col='Bob')]
Example 2. Using columns for the input arguments.
>>> df = spark.createDataFrame(
... [(3, 4, "Alice"), (2, 3, "Bob")], ["sidx", "eidx", "name"])
>>> df.select(df.name.substr(df.sidx, df.eidx).alias("col")).collect()
[Row(col='ice'), Row(col='ob')]
"""
...
@dispatch_col_method
def isin(self, *cols: Any) -> "Column":
"""
A boolean expression that is evaluated to true if the value of this
expression is contained by the evaluated values of the arguments.
.. versionadded:: 1.5.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
cols : Any
The values to compare with the column values. The result will only be true at a location
if any value matches in the Column.
Returns
-------
:class:`Column`
Column of booleans showing whether each element in the Column is contained in cols.
Examples
--------
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob"), (8, "Mike")], ["age", "name"])
Example 1: Filter rows with names in the specified values
>>> df[df.name.isin("Bob", "Mike")].show()
+---+----+
|age|name|
+---+----+
| 5| Bob|
| 8|Mike|
+---+----+
Example 2: Filter rows with ages in the specified list
>>> df[df.age.isin([1, 2, 3])].show()
+---+-----+
|age| name|
+---+-----+
| 2|Alice|
+---+-----+
Example 3: Filter rows with names not in the specified values
>>> df[~df.name.isin("Alice", "Bob")].show()
+---+----+
|age|name|
+---+----+
| 8|Mike|
+---+----+
"""
...
# order
@dispatch_col_method
def asc(self) -> "Column":
"""
Returns a sort expression based on the ascending order of the column.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc()).collect()
[Row(name='Alice'), Row(name='Tom')]
"""
...
@dispatch_col_method
def asc_nulls_first(self) -> "Column":
"""
Returns a sort expression based on ascending order of the column, and null values
appear before non-null values.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()
[Row(name=None), Row(name='Alice'), Row(name='Tom')]
"""
...
@dispatch_col_method
def asc_nulls_last(self) -> "Column":
"""
Returns a sort expression based on ascending order of the column, and null values
appear after non-null values.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
[Row(name='Alice'), Row(name='Tom'), Row(name=None)]
"""
...
@dispatch_col_method
def desc(self) -> "Column":
"""
Returns a sort expression based on the descending order of the column.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc()).collect()
[Row(name='Tom'), Row(name='Alice')]
"""
...
@dispatch_col_method
def desc_nulls_first(self) -> "Column":
"""
Returns a sort expression based on the descending order of the column, and null values
appear before non-null values.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect()
[Row(name=None), Row(name='Tom'), Row(name='Alice')]
"""
...
@dispatch_col_method
def desc_nulls_last(self) -> "Column":
"""
Returns a sort expression based on the descending order of the column, and null values
appear after non-null values.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
[Row(name='Tom'), Row(name='Alice'), Row(name=None)]
"""
...
@dispatch_col_method
def isNull(self) -> "Column":
"""
True if the current expression is null.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
>>> df.filter(df.height.isNull()).collect()
[Row(name='Alice', height=None)]
"""
...
@dispatch_col_method
def isNotNull(self) -> "Column":
"""
True if the current expression is NOT null.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
>>> df.filter(df.height.isNotNull()).collect()
[Row(name='Tom', height=80)]
"""
...
@dispatch_col_method
def isNaN(self) -> "Column":
"""
True if the current expression is NaN.
.. versionadded:: 4.0.0
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [Row(name='Tom', height=80.0), Row(name='Alice', height=float('nan'))])
>>> df.filter(df.height.isNaN()).collect()
[Row(name='Alice', height=nan)]
"""
...
@dispatch_col_method
def alias(self, *alias: str, **kwargs: Any) -> "Column":
"""
Returns this column aliased with a new name or names (in the case of expressions that
return more than one column, such as explode).
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
alias : str
desired column names (collects all positional arguments passed)
Other Parameters
----------------
metadata: dict
a dict of information to be stored in ``metadata`` attribute of the
corresponding :class:`StructField <pyspark.sql.types.StructField>` (optional, keyword
only argument)
.. versionchanged:: 2.2.0
Added optional ``metadata`` argument.
Returns
-------
:class:`Column`
Column aliased with the new name or names.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age.alias("age2")).collect()
[Row(age2=2), Row(age2=5)]
>>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max']
99
"""
...
@dispatch_col_method
def name(self, *alias: str, **kwargs: Any) -> "Column":
"""
:func:`name` is an alias for :func:`alias`.
.. versionadded:: 2.0.0
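Examples
--------
A minimal example, mirroring :func:`alias` (to which this call is equivalent):
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age.name("age2")).collect()
[Row(age2=2), Row(age2=5)]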
"""
...
@dispatch_col_method
def cast(self, dataType: Union[DataType, str]) -> "Column":
"""
Casts the column into type ``dataType``.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
dataType : :class:`DataType` or str
a :class:`DataType` object or a DDL-formatted type string
specifying the type to cast the column to.
Returns
-------
:class:`Column`
Column with each value cast to the new type.
Examples
--------
>>> from pyspark.sql.types import StringType
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age.cast("string").alias('ages')).collect()
[Row(ages='2'), Row(ages='5')]
>>> df.select(df.age.cast(StringType()).alias('ages')).collect()
[Row(ages='2'), Row(ages='5')]
"""
...
@dispatch_col_method
def try_cast(self, dataType: Union[DataType, str]) -> "Column":
"""
This is a special version of `cast` that performs the same operation, but returns NULL
instead of raising an error when the value cannot be cast to the given type.
.. versionadded:: 4.0.0
Parameters
----------
dataType : :class:`DataType` or str
a :class:`DataType` object or a DDL-formatted type string
specifying the type to cast the column to.
Returns
-------
:class:`Column`
Column with each value cast to the new type, or NULL where the cast fails.
Examples
--------
Example 1: Cast with a Datatype
>>> from pyspark.sql.types import LongType
>>> df = spark.createDataFrame(
... [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
>>> df.select(df.name.try_cast(LongType())).show()
+----+
|name|
+----+
| 123|
|NULL|
|NULL|
+----+
Example 2: Cast with a DDL string
>>> df = spark.createDataFrame(
... [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
>>> df.select(df.name.try_cast("double")).show()
+-----+
| name|
+-----+
|123.0|
| NULL|
| NULL|
+-----+
"""
...
@dispatch_col_method
def astype(self, dataType: Union[DataType, str]) -> "Column":
"""
:func:`astype` is an alias for :func:`cast`.
.. versionadded:: 1.4.0
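Examples
--------
A minimal example, mirroring :func:`cast` (to which this call is equivalent):
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age.astype("string").alias('ages')).collect()
[Row(ages='2'), Row(ages='5')]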
"""
...
@dispatch_col_method
def between(
self,
lowerBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
upperBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
) -> "Column":
"""
Check if the current column's values are between the specified lower and upper
bounds, inclusive.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
lowerBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
The lower boundary value, inclusive.
upperBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
The upper boundary value, inclusive.
Returns
-------
:class:`Column`
A new column of boolean values indicating whether each element in the original
column is within the specified range (inclusive).
Examples
--------
Using between with integer values.
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.name, df.age.between(2, 4)).show()
+-----+---------------------------+
| name|((age >= 2) AND (age <= 4))|
+-----+---------------------------+
|Alice| true|
| Bob| false|
+-----+---------------------------+
Using between with string values.
>>> df = spark.createDataFrame([("Alice", "A"), ("Bob", "B")], ["name", "initial"])
>>> df.select(df.name, df.initial.between("A", "B")).show()
+-----+-----------------------------------+
| name|((initial >= A) AND (initial <= B))|
+-----+-----------------------------------+
|Alice| true|
| Bob| true|
+-----+-----------------------------------+
Using between with float values.
>>> df = spark.createDataFrame(
... [(2.5, "Alice"), (5.5, "Bob")], ["height", "name"])
>>> df.select(df.name, df.height.between(2.0, 5.0)).show()
+-----+-------------------------------------+
| name|((height >= 2.0) AND (height <= 5.0))|
+-----+-------------------------------------+
|Alice| true|
| Bob| false|
+-----+-------------------------------------+
Using between with date values.
>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame(
... [("Alice", "2023-01-01"), ("Bob", "2023-02-01")], ["name", "date"])
>>> df = df.withColumn("date", sf.to_date(df.date))
>>> df.select(df.name, df.date.between("2023-01-01", "2023-01-15")).show()
+-----+-----------------------------------------------+
| name|((date >= 2023-01-01) AND (date <= 2023-01-15))|
+-----+-----------------------------------------------+
|Alice| true|
| Bob| false|
+-----+-----------------------------------------------+
>>> from datetime import date
>>> df.select(df.name, df.date.between(date(2023, 1, 1), date(2023, 1, 15))).show()
+-----+-------------------------------------------------------------+
| name|((date >= DATE '2023-01-01') AND (date <= DATE '2023-01-15'))|
+-----+-------------------------------------------------------------+
|Alice| true|
| Bob| false|
+-----+-------------------------------------------------------------+
Using between with timestamp values.
>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame(
... [("Alice", "2023-01-01 10:00:00"), ("Bob", "2023-02-01 10:00:00")],
... schema=["name", "timestamp"])
>>> df = df.withColumn("timestamp", sf.to_timestamp(df.timestamp))
>>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01")).show()
+-----+---------------------------------------------------------+
| name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01))|
+-----+---------------------------------------------------------+
|Alice| true|
| Bob| false|
+-----+---------------------------------------------------------+
>>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01 12:00:00")).show()
+-----+------------------------------------------------------------------+
| name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01 12:00:00))|
+-----+------------------------------------------------------------------+
|Alice| true|
| Bob| true|
+-----+------------------------------------------------------------------+
"""
...
@dispatch_col_method
def when(self, condition: "Column", value: Any) -> "Column":
"""
Evaluates a list of conditions and returns one of multiple possible result expressions.
If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
.. versionadded:: 1.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
condition : :class:`Column`
a boolean :class:`Column` expression.
value
a literal value, or a :class:`Column` expression.
Returns
-------
:class:`Column`
Column representing the result of the evaluated conditions.
Examples
--------
Example 1: Using :func:`when` with conditions and values to create a new Column
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> result = df.select(df.name, sf.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0))
>>> result.show()
+-----+------------------------------------------------------------+
| name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END|
+-----+------------------------------------------------------------+
|Alice| -1|
| Bob| 1|
+-----+------------------------------------------------------------+
Example 2: Chaining multiple :func:`when` conditions
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(1, "Alice"), (4, "Bob"), (6, "Charlie")], ["age", "name"])
>>> result = df.select(
... df.name,
... sf.when(df.age < 3, "Young").when(df.age < 5, "Middle-aged").otherwise("Old")
... )
>>> result.show()
+-------+---------------------------------------------------------------------------+
| name|CASE WHEN (age < 3) THEN Young WHEN (age < 5) THEN Middle-aged ELSE Old END|
+-------+---------------------------------------------------------------------------+
| Alice| Young|
| Bob| Middle-aged|
|Charlie| Old|
+-------+---------------------------------------------------------------------------+
Example 3: Using literal values as conditions
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> result = df.select(
... df.name, sf.when(sf.lit(True), 1).otherwise(
... sf.raise_error("unreachable")).alias("when"))
>>> result.show()
+-----+----+
| name|when|
+-----+----+
|Alice| 1|
| Bob| 1|
+-----+----+
See Also
--------
pyspark.sql.functions.when
"""
...
@dispatch_col_method
def otherwise(self, value: Any) -> "Column":
"""
Evaluates a list of conditions and returns one of multiple possible result expressions.
If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
.. versionadded:: 1.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
value
a literal value, or a :class:`Column` expression.
Returns
-------
:class:`Column`
Column representing the result, with `value` used for unmatched conditions.
Examples
--------
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.name, sf.when(df.age > 3, 1).otherwise(0)).show()
+-----+-------------------------------------+
| name|CASE WHEN (age > 3) THEN 1 ELSE 0 END|
+-----+-------------------------------------+
|Alice| 0|
| Bob| 1|
+-----+-------------------------------------+
See Also
--------
pyspark.sql.functions.when
"""
...
@dispatch_col_method
def over(self, window: "WindowSpec") -> "Column":
"""
Define a windowing column.
.. versionadded:: 1.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
window : :class:`WindowSpec`
Returns
-------
:class:`Column`
Examples
--------
>>> from pyspark.sql import Window
>>> window = (
... Window.partitionBy("name")
... .orderBy("age")
... .rowsBetween(Window.unboundedPreceding, Window.currentRow)
... )
>>> from pyspark.sql.functions import rank, min, desc
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.withColumn(
... "rank", rank().over(window)
... ).withColumn(
... "min", min('age').over(window)
... ).sort(desc("age")).show()
+---+-----+----+---+
|age| name|rank|min|
+---+-----+----+---+
| 5| Bob| 1| 5|
| 2|Alice| 1| 2|
+---+-----+----+---+
"""
...
@dispatch_col_method
def __nonzero__(self) -> None:
...
@dispatch_col_method
def __bool__(self) -> None:
...
@dispatch_col_method
def __repr__(self) -> str:
...
def _test() -> None:
import doctest
from pyspark.sql import SparkSession
import pyspark.sql.column
globs = pyspark.sql.column.__dict__.copy()
spark = SparkSession.builder.master("local[4]").appName("sql.column tests").getOrCreate()
globs["spark"] = spark
(failure_count, test_count) = doctest.testmod(
pyspark.sql.column,
globs=globs,
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
)
spark.stop()
if failure_count:
sys.exit(-1)
if __name__ == "__main__":
_test()