#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# mypy: disable-error-code="empty-body"
import sys
from typing import (
overload,
Any,
TYPE_CHECKING,
Union,
)
from pyspark.sql.utils import dispatch_col_method
from pyspark.sql.types import DataType
from pyspark.errors import PySparkValueError
if TYPE_CHECKING:
from py4j.java_gateway import JavaObject
from pyspark.sql._typing import LiteralType, DecimalLiteral, DateTimeLiteral
from pyspark.sql.window import WindowSpec
__all__ = ["Column"]
class Column:
"""
A column in a DataFrame.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
Column instances can be created by
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
Select a column out of a DataFrame
>>> df.name
Column<'name'>
>>> df["name"]
Column<'name'>
Create from an expression
>>> df.age + 1
Column<...>
>>> 1 / df.age
Column<...>
"""
# HACK ALERT!! This is to reduce backward-compatibility concerns, and returns a
# Spark Classic Column by default. This is NOT an API and is NOT supposed to
# be invoked directly. DO NOT use this constructor.
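# Column instances should instead be obtained from a DataFrame (e.g. `df.name` or
# `df["name"]`, as shown in the class docstring above) or via
# `pyspark.sql.functions.col("name")`.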
def __new__(
cls,
jc: "JavaObject",
) -> "Column":
from pyspark.sql.classic.column import Column
return Column.__new__(Column, jc)
def __init__(self, jc: "JavaObject") -> None:
self._jc = jc
# arithmetic operators
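# Each of these builds a new, lazy Column expression, e.g. `df.age + 1` or
# `1 / df.age` as in the class docstring; nothing is evaluated until an action
# such as `collect()` or `show()` runs.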
@dispatch_col_method
def __neg__(self) -> "Column":
...
@dispatch_col_method
def __add__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __sub__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __mul__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __div__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __truediv__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __mod__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __radd__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rsub__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rmul__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rdiv__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rtruediv__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rmod__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __pow__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __rpow__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
# comparison operators
@dispatch_col_method
def __eq__( # type: ignore[override]
self,
other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"],
) -> "Column":
"""binary function"""
...
@dispatch_col_method
def __ne__( # type: ignore[override]
self,
other: Any,
) -> "Column":
"""binary function"""
...
@dispatch_col_method
def __lt__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __le__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __ge__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __gt__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def eqNullSafe(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Equality test that is safe for null values.
.. versionadded:: 2.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
a value or :class:`Column`
Examples
--------
>>> from pyspark.sql import Row
>>> df1 = spark.createDataFrame([
... Row(id=1, value='foo'),
... Row(id=2, value=None)
... ])
>>> df1.select(
... df1['value'] == 'foo',
... df1['value'].eqNullSafe('foo'),
... df1['value'].eqNullSafe(None)
... ).show()
+-------------+---------------+----------------+
|(value = foo)|(value <=> foo)|(value <=> NULL)|
+-------------+---------------+----------------+
| true| true| false|
| NULL| false| true|
+-------------+---------------+----------------+
>>> df2 = spark.createDataFrame([
... Row(value = 'bar'),
... Row(value = None)
... ])
>>> df1.join(df2, df1["value"] == df2["value"]).count()
0
>>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count()
1
>>> df2 = spark.createDataFrame([
... Row(id=1, value=float('NaN')),
... Row(id=2, value=42.0),
... Row(id=3, value=None)
... ])
>>> df2.select(
... df2['value'].eqNullSafe(None),
... df2['value'].eqNullSafe(float('NaN')),
... df2['value'].eqNullSafe(42.0)
... ).show()
+----------------+---------------+----------------+
|(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
+----------------+---------------+----------------+
| false| true| false|
| false| false| true|
| true| false| false|
+----------------+---------------+----------------+
Notes
-----
Unlike Pandas, PySpark doesn't consider NaN values to be NULL. See the
`NaN Semantics <https://spark.apache.org/docs/latest/sql-ref-datatypes.html#nan-semantics>`_
for details.
"""
...
# `and`, `or`, `not` cannot be overloaded in Python,
# so use bitwise operators as boolean operators
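# For example (a sketch using the `df` from the class docstring):
# `df.filter((df.age > 2) & ~(df.name == "Bob"))` combines predicates, whereas the
# plain `and`/`or`/`not` keywords cannot be overloaded and do not work on Columns.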
@dispatch_col_method
def __and__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __or__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __invert__(self) -> "Column":
...
@dispatch_col_method
def __rand__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
@dispatch_col_method
def __ror__(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
...
# container operators
@dispatch_col_method
def __contains__(self, item: Any) -> None:
raise PySparkValueError(
error_class="CANNOT_APPLY_IN_FOR_COLUMN",
message_parameters={},
)
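# Membership tests such as `"Bob" in df.name` are intentionally unsupported; use
# the `contains` or `isin` methods defined below instead, e.g.
# `df.name.contains("Bob")`.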
# bitwise operators
@dispatch_col_method
def bitwiseOR(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Compute bitwise OR of this expression with another expression.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
a value or :class:`Column` to calculate bitwise or(|) with
this :class:`Column`.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(a=170, b=75)])
>>> df.select(df.a.bitwiseOR(df.b)).collect()
[Row((a | b)=235)]
"""
...
@dispatch_col_method
def bitwiseAND(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Compute bitwise AND of this expression with another expression.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
a value or :class:`Column` to calculate bitwise and(&) with
this :class:`Column`.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(a=170, b=75)])
>>> df.select(df.a.bitwiseAND(df.b)).collect()
[Row((a & b)=10)]
"""
...
@dispatch_col_method
def bitwiseXOR(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Compute bitwise XOR of this expression with another expression.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
a value or :class:`Column` to calculate bitwise xor(^) with
this :class:`Column`.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(a=170, b=75)])
>>> df.select(df.a.bitwiseXOR(df.b)).collect()
[Row((a ^ b)=225)]
"""
...
@dispatch_col_method
def getItem(self, key: Any) -> "Column":
"""
An expression that gets an item at position ``ordinal`` out of a list,
or gets an item by key out of a dict.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
key
a literal value, or a :class:`Column` expression.
The ordinal position of the item in a list, or the key of the item in a dict.
.. deprecated:: 3.0.0
:class:`Column` as a parameter is deprecated.
Returns
-------
:class:`Column`
Column representing the item(s) retrieved at the given position from a list, or by key from a dict.
Examples
--------
>>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
>>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
+----+------+
|l[0]|d[key]|
+----+------+
| 1| value|
+----+------+
"""
...
@dispatch_col_method
def getField(self, name: Any) -> "Column":
"""
An expression that gets a field by name in a :class:`StructType`.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
name
a literal value, or a :class:`Column` expression.
The name of the field to retrieve from the struct.
.. deprecated:: 3.0.0
:class:`Column` as a parameter is deprecated.
Returns
-------
:class:`Column`
Column representing the struct field retrieved by name.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))])
>>> df.select(df.r.getField("b")).show()
+---+
|r.b|
+---+
| b|
+---+
>>> df.select(df.r.a).show()
+---+
|r.a|
+---+
| 1|
+---+
"""
...
@dispatch_col_method
def withField(self, fieldName: str, col: "Column") -> "Column":
"""
An expression that adds/replaces a field in :class:`StructType` by name.
.. versionadded:: 3.1.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
fieldName : str
a literal string.
The name of the field to add or replace.
col : :class:`Column`
A :class:`Column` expression for the column with `fieldName`.
Returns
-------
:class:`Column`
Column representing the struct with the field added or replaced by `fieldName`.
Examples
--------
>>> from pyspark.sql import Row
>>> from pyspark.sql.functions import lit
>>> df = spark.createDataFrame([Row(a=Row(b=1, c=2))])
>>> df.withColumn('a', df['a'].withField('b', lit(3))).select('a.b').show()
+---+
| b|
+---+
| 3|
+---+
>>> df.withColumn('a', df['a'].withField('d', lit(4))).select('a.d').show()
+---+
| d|
+---+
| 4|
+---+
"""
...
@dispatch_col_method
def dropFields(self, *fieldNames: str) -> "Column":
"""
An expression that drops fields in :class:`StructType` by name.
This is a no-op if the schema doesn't contain field name(s).
.. versionadded:: 3.1.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
fieldNames : str
Desired field names to drop (collects all positional arguments passed).
Any field matching one of the given names is dropped from the struct.
Returns
-------
:class:`Column`
Column representing the struct with the given field(s) dropped.
Examples
--------
>>> from pyspark.sql import Row
>>> from pyspark.sql.functions import col, lit
>>> df = spark.createDataFrame([
... Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))])
>>> df.withColumn('a', df['a'].dropFields('b')).show()
+-----------------+
| a|
+-----------------+
|{2, 3, {4, 5, 6}}|
+-----------------+
>>> df.withColumn('a', df['a'].dropFields('b', 'c')).show()
+--------------+
| a|
+--------------+
|{3, {4, 5, 6}}|
+--------------+
This method supports dropping multiple nested fields directly e.g.
>>> df.withColumn("a", col("a").dropFields("e.g", "e.h")).show()
+--------------+
| a|
+--------------+
|{1, 2, 3, {4}}|
+--------------+
However, if you are going to add/replace multiple nested fields,
it is preferred to extract out the nested struct before
adding/replacing multiple fields e.g.
>>> df.select(col("a").withField(
... "e", col("a.e").dropFields("g", "h")).alias("a")
... ).show()
+--------------+
| a|
+--------------+
|{1, 2, 3, {4}}|
+--------------+
"""
...
@dispatch_col_method
def __getattr__(self, item: Any) -> "Column":
"""
An expression that gets an item at position ``ordinal`` out of a list,
or gets an item by key out of a dict.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
item
a literal value.
Returns
-------
:class:`Column`
Column representing the item retrieved by key from a dict.
Examples
--------
>>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
>>> df.select(df.d.key).show()
+------+
|d[key]|
+------+
| value|
+------+
"""
...
@dispatch_col_method
def __getitem__(self, k: Any) -> "Column":
"""
An expression that gets an item at position ``ordinal`` out of a list,
or gets an item by key out of a dict.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
k
a literal value, or a slice object without step.
Returns
-------
:class:`Column`
Column representing the item retrieved by key from a dict, or the substring sliced by
the given slice object.
Examples
--------
>>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
>>> df.select(df.l[slice(1, 3)], df.d['key']).show()
+---------------+------+
|substr(l, 1, 3)|d[key]|
+---------------+------+
| abc| value|
+---------------+------+
"""
...
@dispatch_col_method
def __iter__(self) -> None:
...
# string methods
@dispatch_col_method
def contains(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
Contains the other element. Returns a boolean :class:`Column` based on a string match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other
The string or value to search for, as a literal or a :class:`Column`.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.contains('o')).collect()
[Row(age=5, name='Bob')]
"""
...
@dispatch_col_method
def startswith(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
String starts with. Returns a boolean :class:`Column` based on a string match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : :class:`Column` or str
string at start of line (do not use a regex `^`)
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.startswith('Al')).collect()
[Row(age=2, name='Alice')]
>>> df.filter(df.name.startswith('^Al')).collect()
[]
"""
...
@dispatch_col_method
def endswith(
self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
) -> "Column":
"""
String ends with. Returns a boolean :class:`Column` based on a string match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : :class:`Column` or str
string at end of line (do not use a regex `$`)
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.endswith('ice')).collect()
[Row(age=2, name='Alice')]
>>> df.filter(df.name.endswith('ice$')).collect()
[]
"""
...
@dispatch_col_method
def like(self: "Column", other: str) -> "Column":
"""
SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : str
a SQL LIKE pattern
See Also
--------
pyspark.sql.Column.rlike
Returns
-------
:class:`Column`
Column of booleans showing whether each element
in the Column is matched by SQL LIKE pattern.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.like('Al%')).collect()
[Row(age=2, name='Alice')]
"""
...
@dispatch_col_method
def rlike(self: "Column", other: str) -> "Column":
"""
SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex
match.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : str
an extended regex expression
Returns
-------
:class:`Column`
Column of booleans showing whether each element
in the Column is matched by extended regex expression.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.rlike('ice$')).collect()
[Row(age=2, name='Alice')]
"""
...
@dispatch_col_method
def ilike(self: "Column", other: str) -> "Column":
"""
SQL ILIKE expression (case insensitive LIKE). Returns a boolean :class:`Column`
based on a case insensitive match.
.. versionadded:: 3.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
other : str
a SQL LIKE pattern
See Also
--------
pyspark.sql.Column.rlike
Returns
-------
:class:`Column`
Column of booleans showing whether each element
in the Column is matched by SQL LIKE pattern.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.filter(df.name.ilike('%Ice')).collect()
[Row(age=2, name='Alice')]
"""
...
@overload
def substr(self, startPos: int, length: int) -> "Column":
...
@overload
def substr(self, startPos: "Column", length: "Column") -> "Column":
...
@dispatch_col_method
def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) -> "Column":
"""
Return a :class:`Column` which is a substring of the column.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
startPos : :class:`Column` or int
start position
length : :class:`Column` or int
length of the substring
Returns
-------
:class:`Column`
Column representing the substring of each value in the original Column.
Examples
--------
Example 1. Using integers for the input arguments.
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.name.substr(1, 3).alias("col")).collect()
[Row(col='Ali'), Row(col='Bob')]
Example 2. Using columns for the input arguments.
>>> df = spark.createDataFrame(
... [(3, 4, "Alice"), (2, 3, "Bob")], ["sidx", "eidx", "name"])
>>> df.select(df.name.substr(df.sidx, df.eidx).alias("col")).collect()
[Row(col='ice'), Row(col='ob')]
"""
...
@dispatch_col_method
def isin(self, *cols: Any) -> "Column":
"""
A boolean expression that is evaluated to true if the value of this
expression is contained by the evaluated values of the arguments.
.. versionadded:: 1.5.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
cols : Any
The values to compare with the column values. The result will only be true at a location
if any value matches in the Column.
Returns
-------
:class:`Column`
Column of booleans showing whether each element in the Column is contained in cols.
Examples
--------
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob"), (8, "Mike")], ["age", "name"])
Example 1: Filter rows with names in the specified values
>>> df[df.name.isin("Bob", "Mike")].show()
+---+----+
|age|name|
+---+----+
| 5| Bob|
| 8|Mike|
+---+----+
Example 2: Filter rows with ages in the specified list
>>> df[df.age.isin([1, 2, 3])].show()
+---+-----+
|age| name|
+---+-----+
| 2|Alice|
+---+-----+
Example 3: Filter rows with names not in the specified values
>>> df[~df.name.isin("Alice", "Bob")].show()
+---+----+
|age|name|
+---+----+
| 8|Mike|
+---+----+
"""
...
# order
@dispatch_col_method
def asc(self) -> "Column":
"""
Returns a sort expression based on the ascending order of the column.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc()).collect()
[Row(name='Alice'), Row(name='Tom')]
"""
...
@dispatch_col_method
def asc_nulls_first(self) -> "Column":
"""
Returns a sort expression based on ascending order of the column, and null values
appear before non-null values.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()
[Row(name=None), Row(name='Alice'), Row(name='Tom')]
"""
...
@dispatch_col_method
def asc_nulls_last(self) -> "Column":
"""
Returns a sort expression based on ascending order of the column, and null values
appear after non-null values.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
[Row(name='Alice'), Row(name='Tom'), Row(name=None)]
"""
...
@dispatch_col_method
def desc(self) -> "Column":
"""
Returns a sort expression based on the descending order of the column.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc()).collect()
[Row(name='Tom'), Row(name='Alice')]
"""
...
@dispatch_col_method
def desc_nulls_first(self) -> "Column":
"""
Returns a sort expression based on the descending order of the column, and null values
appear before non-null values.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect()
[Row(name=None), Row(name='Tom'), Row(name='Alice')]
"""
...
@dispatch_col_method
def desc_nulls_last(self) -> "Column":
"""
Returns a sort expression based on the descending order of the column, and null values
appear after non-null values.
.. versionadded:: 2.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
[Row(name='Tom'), Row(name='Alice'), Row(name=None)]
"""
...
@dispatch_col_method
def isNull(self) -> "Column":
"""
True if the current expression is null.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
>>> df.filter(df.height.isNull()).collect()
[Row(name='Alice', height=None)]
"""
...
@dispatch_col_method
def isNotNull(self) -> "Column":
"""
True if the current expression is NOT null.
.. versionchanged:: 3.4.0
Supports Spark Connect.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
>>> df.filter(df.height.isNotNull()).collect()
[Row(name='Tom', height=80)]
"""
...
@dispatch_col_method
def isNaN(self) -> "Column":
"""
True if the current expression is NaN.
.. versionadded:: 4.0.0
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame(
... [Row(name='Tom', height=80.0), Row(name='Alice', height=float('nan'))])
>>> df.filter(df.height.isNaN()).collect()
[Row(name='Alice', height=nan)]
"""
...
@dispatch_col_method
def alias(self, *alias: str, **kwargs: Any) -> "Column":
"""
Returns this column aliased with a new name or names (in the case of expressions that
return more than one column, such as explode).
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
alias : str
desired column names (collects all positional arguments passed)
Other Parameters
----------------
metadata: dict
a dict of information to be stored in ``metadata`` attribute of the
corresponding :class:`StructField <pyspark.sql.types.StructField>` (optional, keyword
only argument)
.. versionchanged:: 2.2.0
Added optional ``metadata`` argument.
Returns
-------
:class:`Column`
Column aliased with the new name or names.
Examples
--------
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age.alias("age2")).collect()
[Row(age2=2), Row(age2=5)]
>>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max']
99
"""
...
@dispatch_col_method
def name(self, *alias: str, **kwargs: Any) -> "Column":
"""
:func:`name` is an alias for :func:`alias`.
.. versionadded:: 2.0.0
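Examples
--------
A minimal example, mirroring :func:`alias` (to which this call is equivalent):
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age.name("age2")).collect()
[Row(age2=2), Row(age2=5)]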
"""
...
@dispatch_col_method
def cast(self, dataType: Union[DataType, str]) -> "Column":
"""
Casts the column into type ``dataType``.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
dataType : :class:`DataType` or str
a :class:`DataType` object or a DDL-formatted type string
specifying the type to cast the column to.
Returns
-------
:class:`Column`
Column with each value cast to the new type.
Examples
--------
>>> from pyspark.sql.types import StringType
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age.cast("string").alias('ages')).collect()
[Row(ages='2'), Row(ages='5')]
>>> df.select(df.age.cast(StringType()).alias('ages')).collect()
[Row(ages='2'), Row(ages='5')]
"""
...
@dispatch_col_method
def try_cast(self, dataType: Union[DataType, str]) -> "Column":
"""
This is a special version of `cast` that performs the same operation, but returns NULL
instead of raising an error when the value cannot be cast to the given type.
.. versionadded:: 4.0.0
Parameters
----------
dataType : :class:`DataType` or str
a :class:`DataType` object or a DDL-formatted type string
specifying the type to cast the column to.
Returns
-------
:class:`Column`
Column with each value cast to the new type, or NULL where the cast fails.
Examples
--------
Example 1: Cast with a Datatype
>>> from pyspark.sql.types import LongType
>>> df = spark.createDataFrame(
... [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
>>> df.select(df.name.try_cast(LongType())).show()
+----+
|name|
+----+
| 123|
|NULL|
|NULL|
+----+
Example 2: Cast with a DDL string
>>> df = spark.createDataFrame(
... [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
>>> df.select(df.name.try_cast("double")).show()
+-----+
| name|
+-----+
|123.0|
| NULL|
| NULL|
+-----+
"""
...
@dispatch_col_method
def astype(self, dataType: Union[DataType, str]) -> "Column":
"""
:func:`astype` is an alias for :func:`cast`.
.. versionadded:: 1.4.0
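Examples
--------
A minimal example, mirroring :func:`cast` (to which this call is equivalent):
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.age.astype("string").alias('ages')).collect()
[Row(ages='2'), Row(ages='5')]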
"""
...
@dispatch_col_method
def between(
self,
lowerBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
upperBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
) -> "Column":
"""
Check if the current column's values are between the specified lower and upper
bounds, inclusive.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
lowerBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
The lower boundary value, inclusive.
upperBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
The upper boundary value, inclusive.
Returns
-------
:class:`Column`
A new column of boolean values indicating whether each element in the original
column is within the specified range (inclusive).
Examples
--------
Using between with integer values.
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.name, df.age.between(2, 4)).show()
+-----+---------------------------+
| name|((age >= 2) AND (age <= 4))|
+-----+---------------------------+
|Alice| true|
| Bob| false|
+-----+---------------------------+
Using between with string values.
>>> df = spark.createDataFrame([("Alice", "A"), ("Bob", "B")], ["name", "initial"])
>>> df.select(df.name, df.initial.between("A", "B")).show()
+-----+-----------------------------------+
| name|((initial >= A) AND (initial <= B))|
+-----+-----------------------------------+
|Alice| true|
| Bob| true|
+-----+-----------------------------------+
Using between with float values.
>>> df = spark.createDataFrame(
... [(2.5, "Alice"), (5.5, "Bob")], ["height", "name"])
>>> df.select(df.name, df.height.between(2.0, 5.0)).show()
+-----+-------------------------------------+
| name|((height >= 2.0) AND (height <= 5.0))|
+-----+-------------------------------------+
|Alice| true|
| Bob| false|
+-----+-------------------------------------+
Using between with date values.
>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame(
... [("Alice", "2023-01-01"), ("Bob", "2023-02-01")], ["name", "date"])
>>> df = df.withColumn("date", sf.to_date(df.date))
>>> df.select(df.name, df.date.between("2023-01-01", "2023-01-15")).show()
+-----+-----------------------------------------------+
| name|((date >= 2023-01-01) AND (date <= 2023-01-15))|
+-----+-----------------------------------------------+
|Alice| true|
| Bob| false|
+-----+-----------------------------------------------+
>>> from datetime import date
>>> df.select(df.name, df.date.between(date(2023, 1, 1), date(2023, 1, 15))).show()
+-----+-------------------------------------------------------------+
| name|((date >= DATE '2023-01-01') AND (date <= DATE '2023-01-15'))|
+-----+-------------------------------------------------------------+
|Alice| true|
| Bob| false|
+-----+-------------------------------------------------------------+
Using between with timestamp values.
>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame(
... [("Alice", "2023-01-01 10:00:00"), ("Bob", "2023-02-01 10:00:00")],
... schema=["name", "timestamp"])
>>> df = df.withColumn("timestamp", sf.to_timestamp(df.timestamp))
>>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01")).show()
+-----+---------------------------------------------------------+
| name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01))|
+-----+---------------------------------------------------------+
|Alice| true|
| Bob| false|
+-----+---------------------------------------------------------+
>>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01 12:00:00")).show()
+-----+------------------------------------------------------------------+
| name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01 12:00:00))|
+-----+------------------------------------------------------------------+
|Alice| true|
| Bob| true|
+-----+------------------------------------------------------------------+
"""
...
@dispatch_col_method
def when(self, condition: "Column", value: Any) -> "Column":
"""
Evaluates a list of conditions and returns one of multiple possible result expressions.
If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
.. versionadded:: 1.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
condition : :class:`Column`
a boolean :class:`Column` expression.
value
a literal value, or a :class:`Column` expression.
Returns
-------
:class:`Column`
Column representing the result of the evaluated conditions.
Examples
--------
Example 1: Using :func:`when` with conditions and values to create a new Column
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> result = df.select(df.name, sf.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0))
>>> result.show()
+-----+------------------------------------------------------------+
| name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END|
+-----+------------------------------------------------------------+
|Alice| -1|
| Bob| 1|
+-----+------------------------------------------------------------+
Example 2: Chaining multiple :func:`when` conditions
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(1, "Alice"), (4, "Bob"), (6, "Charlie")], ["age", "name"])
>>> result = df.select(
... df.name,
... sf.when(df.age < 3, "Young").when(df.age < 5, "Middle-aged").otherwise("Old")
... )
>>> result.show()
+-------+---------------------------------------------------------------------------+
| name|CASE WHEN (age < 3) THEN Young WHEN (age < 5) THEN Middle-aged ELSE Old END|
+-------+---------------------------------------------------------------------------+
| Alice| Young|
| Bob| Middle-aged|
|Charlie| Old|
+-------+---------------------------------------------------------------------------+
Example 3: Using literal values as conditions
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> result = df.select(
... df.name, sf.when(sf.lit(True), 1).otherwise(
... sf.raise_error("unreachable")).alias("when"))
>>> result.show()
+-----+----+
| name|when|
+-----+----+
|Alice| 1|
| Bob| 1|
+-----+----+
See Also
--------
pyspark.sql.functions.when
"""
...
@dispatch_col_method
def otherwise(self, value: Any) -> "Column":
"""
Evaluates a list of conditions and returns one of multiple possible result expressions.
If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.
.. versionadded:: 1.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
value
a literal value, or a :class:`Column` expression.
Returns
-------
:class:`Column`
Column representing the result, with `value` used for unmatched conditions.
Examples
--------
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.select(df.name, sf.when(df.age > 3, 1).otherwise(0)).show()
+-----+-------------------------------------+
| name|CASE WHEN (age > 3) THEN 1 ELSE 0 END|
+-----+-------------------------------------+
|Alice| 0|
| Bob| 1|
+-----+-------------------------------------+
See Also
--------
pyspark.sql.functions.when
"""
...
@dispatch_col_method
def over(self, window: "WindowSpec") -> "Column":
"""
Define a windowing column.
.. versionadded:: 1.4.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
window : :class:`WindowSpec`
Returns
-------
:class:`Column`
Examples
--------
>>> from pyspark.sql import Window
>>> window = (
... Window.partitionBy("name")
... .orderBy("age")
... .rowsBetween(Window.unboundedPreceding, Window.currentRow)
... )
>>> from pyspark.sql.functions import rank, min, desc
>>> df = spark.createDataFrame(
... [(2, "Alice"), (5, "Bob")], ["age", "name"])
>>> df.withColumn(
... "rank", rank().over(window)
... ).withColumn(
... "min", min('age').over(window)
... ).sort(desc("age")).show()
+---+-----+----+---+
|age| name|rank|min|
+---+-----+----+---+
| 5| Bob| 1| 5|
| 2|Alice| 1| 2|
+---+-----+----+---+
"""
...
@dispatch_col_method
def __nonzero__(self) -> None:
...
@dispatch_col_method
def __bool__(self) -> None:
...
@dispatch_col_method
def __repr__(self) -> str:
...
def _test() -> None:
import doctest
from pyspark.sql import SparkSession
import pyspark.sql.column
globs = pyspark.sql.column.__dict__.copy()
spark = SparkSession.builder.master("local[4]").appName("sql.column tests").getOrCreate()
globs["spark"] = spark
(failure_count, test_count) = doctest.testmod(
pyspark.sql.column,
globs=globs,
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
)
spark.stop()
if failure_count:
sys.exit(-1)
if __name__ == "__main__":
_test()