| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| """ |
A collection of builtin functions
| """ |
| import inspect |
| import decimal |
| import sys |
| import functools |
| import warnings |
| from typing import ( |
| Any, |
| cast, |
| Callable, |
| Mapping, |
| Sequence, |
| Iterable, |
| overload, |
| Optional, |
| Tuple, |
| Type, |
| TYPE_CHECKING, |
| Union, |
| ValuesView, |
| ) |
| |
| from pyspark.errors import PySparkTypeError, PySparkValueError |
| from pyspark.errors.utils import _with_origin |
| from pyspark.sql.column import Column |
| from pyspark.sql.types import ( |
| ArrayType, |
| ByteType, |
| DataType, |
| StringType, |
| StructType, |
| NumericType, |
| _from_numpy_type, |
| ) |
| |
| # Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409 |
| from pyspark.sql.udf import UserDefinedFunction, _create_py_udf # noqa: F401 |
| from pyspark.sql.udtf import AnalyzeArgument, AnalyzeResult # noqa: F401 |
| from pyspark.sql.udtf import OrderingColumn, PartitioningColumn, SelectedColumn # noqa: F401 |
| from pyspark.sql.udtf import SkipRestOfInputTableException # noqa: F401 |
| from pyspark.sql.udtf import UserDefinedTableFunction, _create_py_udtf, _create_pyarrow_udtf |
| |
| # Keep pandas_udf and PandasUDFType import for backwards compatible import; moved in SPARK-28264 |
| from pyspark.sql.pandas.functions import ( # noqa: F401 |
| arrow_udf, # noqa: F401 |
| pandas_udf, # noqa: F401 |
| ArrowUDFType, # noqa: F401 |
| PandasUDFType, # noqa: F401 |
| ) # noqa: F401 |
| |
| from pyspark.sql.utils import ( |
| to_str as _to_str, |
| try_remote_functions as _try_remote_functions, |
| get_active_spark_context as _get_active_spark_context, |
| enum_to_value as _enum_to_value, |
| ) |
| |
| if TYPE_CHECKING: |
| from pyspark import SparkContext |
| from pyspark.sql.dataframe import DataFrame |
| from pyspark.sql._typing import ( |
| ColumnOrName, |
| DataTypeOrString, |
| UserDefinedFunctionLike, |
| ) |
| |
| |
# Note to developers: all PySpark functions here take strings as column names whenever possible.
# Namely, whenever columns are referred to as arguments, they can be either Column instances or
# strings, with only a few exceptions for legacy or unavoidable reasons.
# If you are fixing other language APIs at the same time, note that this is not the case on the
# Scala side, since it would require adding every single overloaded definition.
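# For example, given a DataFrame ``df`` with a column "value" (an illustrative sketch only),
# the following calls are equivalent:
#   df.select(sqrt("value"))
#   df.select(sqrt(col("value")))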
| |
| |
| def _get_jvm_function(name: str, sc: "SparkContext") -> Callable: |
| """ |
    Retrieves the JVM function identified by `name` from the
    Java gateway associated with `sc`.
| """ |
| assert sc._jvm is not None |
| return getattr(getattr(sc._jvm, "org.apache.spark.sql.functions"), name) |
| |
| |
| def _invoke_function(name: str, *args: Any) -> Column: |
| """ |
    Invokes the JVM function identified by `name` with `args`
    and wraps the result in a :class:`~pyspark.sql.Column`.
| """ |
| from pyspark import SparkContext |
| |
| assert SparkContext._active_spark_context is not None |
| jf = _get_jvm_function(name, SparkContext._active_spark_context) |
| return Column(jf(*args)) |
| |
| |
| def _invoke_function_over_columns(name: str, *cols: "ColumnOrName") -> Column: |
| """ |
    Invokes the n-ary JVM function identified by `name` over the given columns
    and wraps the result in a :class:`~pyspark.sql.Column`.
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function(name, *(_to_java_column(col) for col in cols)) |
| |
| |
| def _invoke_function_over_seq_of_columns(name: str, cols: "Iterable[ColumnOrName]") -> Column: |
| """ |
    Invokes the unary JVM function identified by `name` with a sequence of columns
    and wraps the result in a :class:`~pyspark.sql.Column`.
| """ |
| from pyspark.sql.classic.column import _to_java_column, _to_seq |
| |
| sc = _get_active_spark_context() |
| return _invoke_function(name, _to_seq(sc, cols, _to_java_column)) |
| |
| |
| def _invoke_binary_math_function(name: str, col1: Any, col2: Any) -> Column: |
| """ |
    Invokes the binary JVM math function identified by `name`
    and wraps the result in a :class:`~pyspark.sql.Column`.
| """ |
| from pyspark.sql.classic.column import _to_java_column, _create_column_from_literal |
| |
    # For legacy reasons, the arguments here can be implicitly converted into a column
| cols = [ |
| _to_java_column(c) if isinstance(c, (str, Column)) else _create_column_from_literal(c) |
| for c in (col1, col2) |
| ] |
| return _invoke_function(name, *cols) |
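
# Illustrative usage of the helper above (a sketch; assumes a DataFrame ``df`` and that the
# JVM side exposes a binary math function named "atan2"):
#   _invoke_binary_math_function("atan2", df.y, 2.0)
# The float 2.0 is wrapped as a literal column, while df.y is converted to a Java column directly.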
| |
| |
def _options_to_str(options: Optional[Mapping[str, Any]] = None) -> Mapping[str, Optional[str]]:
    """
    Converts each option value to its string form via `_to_str`,
    e.g. a boolean ``True`` becomes ``"true"``; returns an empty dict if no options are given.
    """
    if options:
        return {key: _to_str(value) for (key, value) in options.items()}
    return {}
| |
| |
| @_try_remote_functions |
| def lit(col: Any) -> Column: |
| """ |
| Creates a :class:`~pyspark.sql.Column` of literal value. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
    col : :class:`~pyspark.sql.Column`, str, int, float, bool, list, NumPy literal or ndarray
        the value to convert into a PySpark literal. If a column is passed,
        it is returned as is.
| |
| .. versionchanged:: 3.4.0 |
| Since 3.4.0, it supports the list type. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the literal instance. |
| |
| Examples |
| -------- |
| Example 1: Creating a literal column with an integer value. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.range(1) |
| >>> df.select(sf.lit(5).alias('height'), df.id).show() |
| +------+---+ |
| |height| id| |
| +------+---+ |
| | 5| 0| |
| +------+---+ |
| |
| Example 2: Creating a literal column from a list. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.lit([1, 2, 3])).show() |
| +--------------+ |
| |array(1, 2, 3)| |
| +--------------+ |
| | [1, 2, 3]| |
| +--------------+ |
| |
| Example 3: Creating a literal column from a string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.range(1) |
| >>> df.select(sf.lit("PySpark").alias('framework'), df.id).show() |
| +---------+---+ |
| |framework| id| |
| +---------+---+ |
| | PySpark| 0| |
| +---------+---+ |
| |
| Example 4: Creating a literal column from a boolean value. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(True, "Yes"), (False, "No")], ["flag", "response"]) |
| >>> df.select(sf.lit(False).alias('is_approved'), df.response).show() |
| +-----------+--------+ |
| |is_approved|response| |
| +-----------+--------+ |
| | false| Yes| |
| | false| No| |
| +-----------+--------+ |
| |
    Example 5: Creating literal columns from NumPy scalars.
| |
| >>> from pyspark.sql import functions as sf |
| >>> import numpy as np # doctest: +SKIP |
| >>> spark.range(1).select( |
| ... sf.lit(np.bool_(True)), |
| ... sf.lit(np.int64(123)), |
| ... sf.lit(np.float64(0.456)), |
| ... sf.lit(np.str_("xyz")) |
| ... ).show() # doctest: +SKIP |
| +----+---+-----+---+ |
| |true|123|0.456|xyz| |
| +----+---+-----+---+ |
| |true|123|0.456|xyz| |
| +----+---+-----+---+ |
| |
    Example 6: Creating literal columns from NumPy ndarrays.
| |
| >>> from pyspark.sql import functions as sf |
| >>> import numpy as np # doctest: +SKIP |
| >>> spark.range(1).select( |
| ... sf.lit(np.array([True, False], np.bool_)), |
| ... sf.lit(np.array([], np.int8)), |
| ... sf.lit(np.array([1.5, 0.1], np.float64)), |
| ... sf.lit(np.array(["a", "b", "c"], np.str_)), |
| ... ).show() # doctest: +SKIP |
| +------------------+-------+-----------------+--------------------+ |
| |ARRAY(true, false)|ARRAY()|ARRAY(1.5D, 0.1D)|ARRAY('a', 'b', 'c')| |
| +------------------+-------+-----------------+--------------------+ |
| | [true, false]| []| [1.5, 0.1]| [a, b, c]| |
| +------------------+-------+-----------------+--------------------+ |
| """ |
| from pyspark.testing.utils import have_numpy |
| |
| if isinstance(col, Column): |
| return col |
| elif isinstance(col, list): |
| if any(isinstance(c, Column) for c in col): |
| raise PySparkValueError( |
| errorClass="COLUMN_IN_LIST", messageParameters={"func_name": "lit"} |
| ) |
| return array(*[lit(item) for item in col]) |
| elif have_numpy: |
| import numpy as np |
| |
| if isinstance(col, np.generic): |
| dt = _from_numpy_type(col.dtype) |
| if dt is None: |
| raise PySparkTypeError( |
| errorClass="UNSUPPORTED_NUMPY_ARRAY_SCALAR", |
| messageParameters={"dtype": col.dtype.name}, |
| ) |
| if isinstance(dt, NumericType): |
| # NumpyScalarConverter for Py4J converts numeric scalar to Python scalar. |
| # E.g. numpy.int64(1) is converted to int(1). |
| # So, we need to cast it back to the original type. |
| return _invoke_function("lit", col).astype(dt).alias(str(col)) |
| else: |
| return _invoke_function("lit", col) |
| elif isinstance(col, np.ndarray) and col.ndim == 1: |
| dt = _from_numpy_type(col.dtype) |
| if dt is None: |
| raise PySparkTypeError( |
| errorClass="UNSUPPORTED_NUMPY_ARRAY_SCALAR", |
| messageParameters={"dtype": col.dtype.name}, |
| ) |
| if isinstance(dt, ByteType): |
| # NumpyArrayConverter for Py4J converts Array[Byte] to Array[Short]. |
| # Cast it back to ByteType. |
| return _invoke_function("lit", col).cast(ArrayType(dt)) |
| else: |
| return _invoke_function("lit", col) |
| return _invoke_function("lit", _enum_to_value(col)) |
| |
| |
| @_try_remote_functions |
| @_with_origin |
| def col(col: str) -> Column: |
| """ |
| Returns a :class:`~pyspark.sql.Column` based on the given column name. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : column name |
| the name for the column |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the corresponding column instance. |
| |
| Examples |
| -------- |
| >>> col('x') |
| Column<'x'> |
| >>> column('x') |
| Column<'x'> |
| """ |
| return _invoke_function("col", col) |
| |
| |
| column = col |
| |
| |
| @_try_remote_functions |
| def asc(col: "ColumnOrName") -> Column: |
| """ |
| Returns a sort expression for the target column in ascending order. |
| This function is used in `sort` and `orderBy` functions. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| Target column to sort by in the ascending order. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The column specifying the sort order. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.asc_nulls_first` |
| :meth:`pyspark.sql.functions.asc_nulls_last` |
| |
| Examples |
| -------- |
| Example 1: Sort DataFrame by 'id' column in ascending order. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(4, 'B'), (3, 'A'), (2, 'C')], ['id', 'value']) |
| >>> df.sort(sf.asc("id")).show() |
| +---+-----+ |
| | id|value| |
| +---+-----+ |
| | 2| C| |
| | 3| A| |
| | 4| B| |
| +---+-----+ |
| |
| Example 2: Use `asc` in `orderBy` function to sort the DataFrame. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(4, 'B'), (3, 'A'), (2, 'C')], ['id', 'value']) |
| >>> df.orderBy(sf.asc("value")).show() |
| +---+-----+ |
| | id|value| |
| +---+-----+ |
| | 3| A| |
| | 4| B| |
| | 2| C| |
| +---+-----+ |
| |
| Example 3: Combine `asc` with `desc` to sort by multiple columns. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(2, 'A', 4), (1, 'B', 3), (3, 'A', 2)], |
| ... ['id', 'group', 'value']) |
| >>> df.sort(sf.asc("group"), sf.desc("value")).show() |
| +---+-----+-----+ |
| | id|group|value| |
| +---+-----+-----+ |
| | 2| A| 4| |
| | 3| A| 2| |
| | 1| B| 3| |
| +---+-----+-----+ |
| |
    Example 4: Using the `asc` method of a column expression.
| |
| >>> df = spark.createDataFrame([(4, 'B'), (3, 'A'), (2, 'C')], ['id', 'value']) |
| >>> df.sort(df.id.asc()).show() |
| +---+-----+ |
| | id|value| |
| +---+-----+ |
| | 2| C| |
| | 3| A| |
| | 4| B| |
| +---+-----+ |
| """ |
| return col.asc() if isinstance(col, Column) else _invoke_function("asc", col) |
| |
| |
| @_try_remote_functions |
| def desc(col: "ColumnOrName") -> Column: |
| """ |
| Returns a sort expression for the target column in descending order. |
| This function is used in `sort` and `orderBy` functions. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| Target column to sort by in the descending order. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The column specifying the sort order. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.desc_nulls_first` |
| :meth:`pyspark.sql.functions.desc_nulls_last` |
| |
| Examples |
| -------- |
| Example 1: Sort DataFrame by 'id' column in descending order. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(4, 'B'), (3, 'A'), (2, 'C')], ['id', 'value']) |
| >>> df.sort(sf.desc("id")).show() |
| +---+-----+ |
| | id|value| |
| +---+-----+ |
| | 4| B| |
| | 3| A| |
| | 2| C| |
| +---+-----+ |
| |
| Example 2: Use `desc` in `orderBy` function to sort the DataFrame. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(4, 'B'), (3, 'A'), (2, 'C')], ['id', 'value']) |
| >>> df.orderBy(sf.desc("value")).show() |
| +---+-----+ |
| | id|value| |
| +---+-----+ |
| | 2| C| |
| | 4| B| |
| | 3| A| |
| +---+-----+ |
| |
| Example 3: Combine `asc` with `desc` to sort by multiple columns. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(2, 'A', 4), (1, 'B', 3), (3, 'A', 2)], |
| ... ['id', 'group', 'value']) |
| >>> df.sort(sf.desc("group"), sf.asc("value")).show() |
| +---+-----+-----+ |
| | id|group|value| |
| +---+-----+-----+ |
| | 1| B| 3| |
| | 3| A| 2| |
| | 2| A| 4| |
| +---+-----+-----+ |
| |
    Example 4: Using the `desc` method of a column expression.
| |
| >>> df = spark.createDataFrame([(4, 'B'), (3, 'A'), (2, 'C')], ['id', 'value']) |
| >>> df.sort(df.id.desc()).show() |
| +---+-----+ |
| | id|value| |
| +---+-----+ |
| | 4| B| |
| | 3| A| |
| | 2| C| |
| +---+-----+ |
| """ |
| return col.desc() if isinstance(col, Column) else _invoke_function("desc", col) |
| |
| |
| @_try_remote_functions |
| def sqrt(col: "ColumnOrName") -> Column: |
| """ |
| Computes the square root of the specified float value. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| column for computed results. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (-1), (0), (1), (4), (NULL) AS TAB(value)" |
| ... ).select("*", sf.sqrt("value")).show() |
| +-----+-----------+ |
| |value|SQRT(value)| |
| +-----+-----------+ |
| | -1| NaN| |
| | 0| 0.0| |
| | 1| 1.0| |
| | 4| 2.0| |
| | NULL| NULL| |
| +-----+-----------+ |
| """ |
| return _invoke_function_over_columns("sqrt", col) |
| |
| |
| @_try_remote_functions |
| def try_add(left: "ColumnOrName", right: "ColumnOrName") -> Column: |
| """ |
    Returns the sum of `left` and `right`, and the result is null on overflow.
    The acceptable input types are the same as those of the `+` operator.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| left : :class:`~pyspark.sql.Column` or column name |
| right : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| Example 1: Integer plus Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [(1982, 15), (1990, 2)], ["birth", "age"] |
| ... ).select("*", sf.try_add("birth", "age")).show() |
| +-----+---+-------------------+ |
| |birth|age|try_add(birth, age)| |
| +-----+---+-------------------+ |
| | 1982| 15| 1997| |
| | 1990| 2| 1992| |
| +-----+---+-------------------+ |
| |
| Example 2: Date plus Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (DATE('2015-09-30')) AS TAB(date)" |
| ... ).select("*", sf.try_add("date", sf.lit(1))).show() |
| +----------+----------------+ |
| | date|try_add(date, 1)| |
| +----------+----------------+ |
| |2015-09-30| 2015-10-01| |
| +----------+----------------+ |
| |
| Example 3: Date plus Interval. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (DATE('2015-09-30'), INTERVAL 1 YEAR) AS TAB(date, itvl)" |
| ... ).select("*", sf.try_add("date", "itvl")).show() |
| +----------+-----------------+-------------------+ |
| | date| itvl|try_add(date, itvl)| |
| +----------+-----------------+-------------------+ |
| |2015-09-30|INTERVAL '1' YEAR| 2016-09-30| |
| +----------+-----------------+-------------------+ |
| |
| Example 4: Interval plus Interval. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (INTERVAL 1 YEAR, INTERVAL 2 YEAR) AS TAB(itvl1, itvl2)" |
| ... ).select("*", sf.try_add("itvl1", "itvl2")).show() |
| +-----------------+-----------------+---------------------+ |
| | itvl1| itvl2|try_add(itvl1, itvl2)| |
| +-----------------+-----------------+---------------------+ |
| |INTERVAL '1' YEAR|INTERVAL '2' YEAR| INTERVAL '3' YEAR| |
| +-----------------+-----------------+---------------------+ |
| |
| Example 5: Overflow results in NULL when ANSI mode is on |
| |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... spark.range(1).select(sf.try_add(sf.lit(sys.maxsize), sf.lit(sys.maxsize))).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +-------------------------------------------------+ |
| |try_add(9223372036854775807, 9223372036854775807)| |
| +-------------------------------------------------+ |
| | NULL| |
| +-------------------------------------------------+ |
| """ |
| return _invoke_function_over_columns("try_add", left, right) |
| |
| |
| @_try_remote_functions |
| def try_avg(col: "ColumnOrName") -> Column: |
| """ |
| Returns the mean calculated from values of a group and the result is null on overflow. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| Example 1: Calculating the average age |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"]) |
| >>> df.select(sf.try_avg("age")).show() |
| +------------+ |
| |try_avg(age)| |
| +------------+ |
| | 8.5| |
| +------------+ |
| |
| Example 2: Calculating the average age with None |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) |
| >>> df.select(sf.try_avg("age")).show() |
| +------------+ |
| |try_avg(age)| |
| +------------+ |
| | 3.0| |
| +------------+ |
| |
| Example 3: Overflow results in NULL when ANSI mode is on |
| |
| >>> from decimal import Decimal |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... df = spark.createDataFrame( |
| ... [(Decimal("1" * 38),), (Decimal(0),)], "number DECIMAL(38, 0)") |
| ... df.select(sf.try_avg(df.number)).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +---------------+ |
| |try_avg(number)| |
| +---------------+ |
| | NULL| |
| +---------------+ |
| """ |
| return _invoke_function_over_columns("try_avg", col) |
| |
| |
| @_try_remote_functions |
| def try_divide(left: "ColumnOrName", right: "ColumnOrName") -> Column: |
| """ |
| Returns `dividend`/`divisor`. It always performs floating point division. Its result is |
| always null if `divisor` is 0. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| left : :class:`~pyspark.sql.Column` or column name |
| dividend |
| right : :class:`~pyspark.sql.Column` or column name |
| divisor |
| |
| Examples |
| -------- |
| Example 1: Integer divided by Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [(6000, 15), (1990, 2), (1234, 0)], ["a", "b"] |
| ... ).select("*", sf.try_divide("a", "b")).show() |
| +----+---+----------------+ |
| | a| b|try_divide(a, b)| |
| +----+---+----------------+ |
| |6000| 15| 400.0| |
| |1990| 2| 995.0| |
| |1234| 0| NULL| |
| +----+---+----------------+ |
| |
| Example 2: Interval divided by Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.range(4).select(sf.make_interval(sf.lit(1)).alias("itvl"), "id") |
| >>> df.select("*", sf.try_divide("itvl", "id")).show() |
| +-------+---+--------------------+ |
| | itvl| id|try_divide(itvl, id)| |
| +-------+---+--------------------+ |
| |1 years| 0| NULL| |
| |1 years| 1| 1 years| |
| |1 years| 2| 6 months| |
| |1 years| 3| 4 months| |
| +-------+---+--------------------+ |
| |
| Example 3: Exception during division, resulting in NULL when ANSI mode is on |
| |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... spark.range(1).select(sf.try_divide("id", sf.lit(0))).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +-----------------+ |
| |try_divide(id, 0)| |
| +-----------------+ |
| | NULL| |
| +-----------------+ |
| """ |
| return _invoke_function_over_columns("try_divide", left, right) |
| |
| |
| @_try_remote_functions |
| def try_mod(left: "ColumnOrName", right: "ColumnOrName") -> Column: |
| """ |
| Returns the remainder after `dividend`/`divisor`. Its result is |
| always null if `divisor` is 0. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| left : :class:`~pyspark.sql.Column` or column name |
| dividend |
| right : :class:`~pyspark.sql.Column` or column name |
| divisor |
| |
| Examples |
| -------- |
| Example 1: Integer divided by Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [(6000, 15), (3, 2), (1234, 0)], ["a", "b"] |
| ... ).select("*", sf.try_mod("a", "b")).show() |
| +----+---+-------------+ |
| | a| b|try_mod(a, b)| |
| +----+---+-------------+ |
| |6000| 15| 0| |
| | 3| 2| 1| |
| |1234| 0| NULL| |
| +----+---+-------------+ |
| |
| Example 2: Exception during division, resulting in NULL when ANSI mode is on |
| |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... spark.range(1).select(sf.try_mod("id", sf.lit(0))).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +--------------+ |
| |try_mod(id, 0)| |
| +--------------+ |
| | NULL| |
| +--------------+ |
| """ |
| return _invoke_function_over_columns("try_mod", left, right) |
| |
| |
| @_try_remote_functions |
| def try_multiply(left: "ColumnOrName", right: "ColumnOrName") -> Column: |
| """ |
    Returns `left` * `right`, and the result is null on overflow. The acceptable input types
    are the same as those of the `*` operator.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| left : :class:`~pyspark.sql.Column` or column name |
| multiplicand |
| right : :class:`~pyspark.sql.Column` or column name |
| multiplier |
| |
| Examples |
| -------- |
| Example 1: Integer multiplied by Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [(6000, 15), (1990, 2)], ["a", "b"] |
| ... ).select("*", sf.try_multiply("a", "b")).show() |
| +----+---+------------------+ |
| | a| b|try_multiply(a, b)| |
| +----+---+------------------+ |
| |6000| 15| 90000| |
| |1990| 2| 3980| |
| +----+---+------------------+ |
| |
| Example 2: Interval multiplied by Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.range(6).select(sf.make_interval(sf.col("id"), sf.lit(3)).alias("itvl"), "id") |
| >>> df.select("*", sf.try_multiply("itvl", "id")).show() |
| +----------------+---+----------------------+ |
| | itvl| id|try_multiply(itvl, id)| |
| +----------------+---+----------------------+ |
| | 3 months| 0| 0 seconds| |
| |1 years 3 months| 1| 1 years 3 months| |
| |2 years 3 months| 2| 4 years 6 months| |
| |3 years 3 months| 3| 9 years 9 months| |
| |4 years 3 months| 4| 17 years| |
| |5 years 3 months| 5| 26 years 3 months| |
| +----------------+---+----------------------+ |
| |
| Example 3: Overflow results in NULL when ANSI mode is on |
| |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... spark.range(1).select(sf.try_multiply(sf.lit(sys.maxsize), sf.lit(sys.maxsize))).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +------------------------------------------------------+ |
| |try_multiply(9223372036854775807, 9223372036854775807)| |
| +------------------------------------------------------+ |
| | NULL| |
| +------------------------------------------------------+ |
| """ |
| return _invoke_function_over_columns("try_multiply", left, right) |
| |
| |
| @_try_remote_functions |
| def try_subtract(left: "ColumnOrName", right: "ColumnOrName") -> Column: |
| """ |
    Returns `left` - `right`, and the result is null on overflow. The acceptable input types
    are the same as those of the `-` operator.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| left : :class:`~pyspark.sql.Column` or column name |
| right : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| Example 1: Integer minus Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [(1982, 15), (1990, 2)], ["birth", "age"] |
| ... ).select("*", sf.try_subtract("birth", "age")).show() |
| +-----+---+------------------------+ |
| |birth|age|try_subtract(birth, age)| |
| +-----+---+------------------------+ |
| | 1982| 15| 1967| |
| | 1990| 2| 1988| |
| +-----+---+------------------------+ |
| |
| Example 2: Date minus Integer. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (DATE('2015-10-01')) AS TAB(date)" |
| ... ).select("*", sf.try_subtract("date", sf.lit(1))).show() |
| +----------+---------------------+ |
| | date|try_subtract(date, 1)| |
| +----------+---------------------+ |
| |2015-10-01| 2015-09-30| |
| +----------+---------------------+ |
| |
| Example 3: Date minus Interval. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (DATE('2015-09-30'), INTERVAL 1 YEAR) AS TAB(date, itvl)" |
| ... ).select("*", sf.try_subtract("date", "itvl")).show() |
| +----------+-----------------+------------------------+ |
| | date| itvl|try_subtract(date, itvl)| |
| +----------+-----------------+------------------------+ |
| |2015-09-30|INTERVAL '1' YEAR| 2014-09-30| |
| +----------+-----------------+------------------------+ |
| |
| Example 4: Interval minus Interval. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (INTERVAL 1 YEAR, INTERVAL 2 YEAR) AS TAB(itvl1, itvl2)" |
| ... ).select("*", sf.try_subtract("itvl1", "itvl2")).show() |
| +-----------------+-----------------+--------------------------+ |
| | itvl1| itvl2|try_subtract(itvl1, itvl2)| |
| +-----------------+-----------------+--------------------------+ |
| |INTERVAL '1' YEAR|INTERVAL '2' YEAR| INTERVAL '-1' YEAR| |
| +-----------------+-----------------+--------------------------+ |
| |
| Example 5: Overflow results in NULL when ANSI mode is on |
| |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... spark.range(1).select(sf.try_subtract(sf.lit(-sys.maxsize), sf.lit(sys.maxsize))).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +-------------------------------------------------------+ |
| |try_subtract(-9223372036854775807, 9223372036854775807)| |
| +-------------------------------------------------------+ |
| | NULL| |
| +-------------------------------------------------------+ |
| """ |
| return _invoke_function_over_columns("try_subtract", left, right) |
| |
| |
| @_try_remote_functions |
| def try_sum(col: "ColumnOrName") -> Column: |
| """ |
| Returns the sum calculated from values of a group and the result is null on overflow. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| Example 1: Calculating the sum of values in a column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(10).select(sf.try_sum("id")).show() |
| +-----------+ |
| |try_sum(id)| |
| +-----------+ |
| | 45| |
| +-----------+ |
| |
| Example 2: Using a plus expression together to calculate the sum |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, 2), (3, 4)], ["A", "B"]) |
| >>> df.select(sf.try_sum(sf.col("A") + sf.col("B"))).show() |
| +----------------+ |
| |try_sum((A + B))| |
| +----------------+ |
| | 10| |
| +----------------+ |
| |
| Example 3: Calculating the summation of ages with None |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) |
| >>> df.select(sf.try_sum("age")).show() |
| +------------+ |
| |try_sum(age)| |
| +------------+ |
| | 6| |
| +------------+ |
| |
| Example 4: Overflow results in NULL when ANSI mode is on |
| |
| >>> from decimal import Decimal |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... df = spark.createDataFrame([(Decimal("1" * 38),)] * 10, "number DECIMAL(38, 0)") |
| ... df.select(sf.try_sum(df.number)).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +---------------+ |
| |try_sum(number)| |
| +---------------+ |
| | NULL| |
| +---------------+ |
| """ |
| return _invoke_function_over_columns("try_sum", col) |
| |
| |
| @_try_remote_functions |
| def abs(col: "ColumnOrName") -> Column: |
| """ |
| Mathematical Function: Computes the absolute value of the given column or expression. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column or expression to compute the absolute value on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column object representing the absolute value of the input. |
| |
| Examples |
| -------- |
| Example 1: Compute the absolute value of a long column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-1,), (-2,), (-3,), (None,)], ["value"]) |
| >>> df.select("*", sf.abs(df.value)).show() |
| +-----+----------+ |
| |value|abs(value)| |
| +-----+----------+ |
| | -1| 1| |
| | -2| 2| |
| | -3| 3| |
| | NULL| NULL| |
| +-----+----------+ |
| |
| Example 2: Compute the absolute value of a double column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-1.5,), (-2.5,), (None,), (float("nan"),)], ["value"]) |
| >>> df.select("*", sf.abs(df.value)).show() |
| +-----+----------+ |
| |value|abs(value)| |
| +-----+----------+ |
| | -1.5| 1.5| |
| | -2.5| 2.5| |
| | NULL| NULL| |
| | NaN| NaN| |
| +-----+----------+ |
| |
| Example 3: Compute the absolute value of an expression |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, 1), (2, -2), (3, 3)], ["id", "value"]) |
| >>> df.select("*", sf.abs(df.id - df.value)).show() |
| +---+-----+-----------------+ |
| | id|value|abs((id - value))| |
| +---+-----+-----------------+ |
| | 1| 1| 0| |
| | 2| -2| 4| |
| | 3| 3| 0| |
| +---+-----+-----------------+ |
| """ |
| return _invoke_function_over_columns("abs", col) |
| |
| |
| @_try_remote_functions |
| def mode(col: "ColumnOrName", deterministic: bool = False) -> Column: |
| """ |
| Returns the most frequent value in a group. |
| |
| .. versionadded:: 3.4.0 |
| |
| .. versionchanged:: 4.0.0 |
| Supports deterministic argument. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| deterministic : bool, optional |
| if there are multiple equally-frequent results then return the lowest (defaults to false). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the most frequent value in a group. |
| |
| Notes |
| ----- |
| Supports Spark Connect. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("Java", 2012, 20000), ("dotNET", 2012, 5000), |
| ... ("Java", 2012, 20000), ("dotNET", 2012, 5000), |
| ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)], |
| ... schema=("course", "year", "earnings")) |
| >>> df.groupby("course").agg(sf.mode("year")).sort("course").show() |
| +------+----------+ |
| |course|mode(year)| |
| +------+----------+ |
| | Java| 2012| |
| |dotNET| 2012| |
| +------+----------+ |
| |
    When multiple values have the same greatest frequency, any of them may be returned if
    `deterministic` is false or not specified, while the lowest value is returned if
    `deterministic` is true.
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-10,), (0,), (10,)], ["col"]) |
| >>> df.select(sf.mode("col", False)).show() # doctest: +SKIP |
| +---------+ |
| |mode(col)| |
| +---------+ |
| | 0| |
| +---------+ |
| |
| >>> df.select(sf.mode("col", True)).show() |
| +---------------------------------------+ |
| |mode() WITHIN GROUP (ORDER BY col DESC)| |
| +---------------------------------------+ |
| | -10| |
| +---------------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("mode", _to_java_column(col), _enum_to_value(deterministic)) |
| |
| |
| @_try_remote_functions |
| def max(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the maximum value of the expression in a group. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column on which the maximum value is computed. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column that contains the maximum value computed. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.min` |
| :meth:`pyspark.sql.functions.avg` |
| :meth:`pyspark.sql.functions.sum` |
| |
| Notes |
| ----- |
| - Null values are ignored during the computation. |
| - NaN values are larger than any other numeric value. |
| |
| Examples |
| -------- |
| Example 1: Compute the maximum value of a numeric column |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.range(10) |
| >>> df.select(sf.max(df.id)).show() |
| +-------+ |
| |max(id)| |
| +-------+ |
| | 9| |
| +-------+ |
| |
| Example 2: Compute the maximum value of a string column |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("A",), ("B",), ("C",)], ["value"]) |
| >>> df.select(sf.max(df.value)).show() |
| +----------+ |
| |max(value)| |
| +----------+ |
| | C| |
| +----------+ |
| |
| Example 3: Compute the maximum value of a column in a grouped DataFrame |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("A", 1), ("A", 2), ("B", 3), ("B", 4)], ["key", "value"]) |
| >>> df.groupBy("key").agg(sf.max(df.value)).show() |
| +---+----------+ |
| |key|max(value)| |
| +---+----------+ |
| | A| 2| |
| | B| 4| |
| +---+----------+ |
| |
| Example 4: Compute the maximum value of multiple columns in a grouped DataFrame |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("A", 1, 2), ("A", 2, 3), ("B", 3, 4), ("B", 4, 5)], ["key", "value1", "value2"]) |
| >>> df.groupBy("key").agg(sf.max("value1"), sf.max("value2")).show() |
| +---+-----------+-----------+ |
| |key|max(value1)|max(value2)| |
| +---+-----------+-----------+ |
| | A| 2| 3| |
| | B| 4| 5| |
| +---+-----------+-----------+ |
| |
| Example 5: Compute the maximum value of a column with null values |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1,), (2,), (None,)], ["value"]) |
| >>> df.select(sf.max(df.value)).show() |
| +----------+ |
| |max(value)| |
| +----------+ |
| | 2| |
| +----------+ |
| |
| Example 6: Compute the maximum value of a column with "NaN" values |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1.1,), (float("nan"),), (3.3,)], ["value"]) |
| >>> df.select(sf.max(df.value)).show() |
| +----------+ |
| |max(value)| |
| +----------+ |
| | NaN| |
| +----------+ |
| """ |
| return _invoke_function_over_columns("max", col) |
| |
| |
| @_try_remote_functions |
| def min(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the minimum value of the expression in a group. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column on which the minimum value is computed. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column that contains the minimum value computed. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.max` |
| :meth:`pyspark.sql.functions.avg` |
| :meth:`pyspark.sql.functions.sum` |
| |
| Examples |
| -------- |
| Example 1: Compute the minimum value of a numeric column |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.range(10) |
| >>> df.select(sf.min(df.id)).show() |
| +-------+ |
| |min(id)| |
| +-------+ |
| | 0| |
| +-------+ |
| |
| Example 2: Compute the minimum value of a string column |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("Alice",), ("Bob",), ("Charlie",)], ["name"]) |
| >>> df.select(sf.min("name")).show() |
| +---------+ |
| |min(name)| |
| +---------+ |
| | Alice| |
| +---------+ |
| |
| Example 3: Compute the minimum value of a column with null values |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1,), (None,), (3,)], ["value"]) |
| >>> df.select(sf.min("value")).show() |
| +----------+ |
| |min(value)| |
| +----------+ |
| | 1| |
| +----------+ |
| |
| Example 4: Compute the minimum value of a column in a grouped DataFrame |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("Alice", 1), ("Alice", 2), ("Bob", 3)], ["name", "value"]) |
| >>> df.groupBy("name").agg(sf.min("value")).show() |
| +-----+----------+ |
| | name|min(value)| |
| +-----+----------+ |
| |Alice| 1| |
| | Bob| 3| |
| +-----+----------+ |
| |
| Example 5: Compute the minimum value of a column in a DataFrame with multiple columns |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("Alice", 1, 100), ("Bob", 2, 200), ("Charlie", 3, 300)], |
| ... ["name", "value1", "value2"]) |
| >>> df.select(sf.min("value1"), sf.min("value2")).show() |
| +-----------+-----------+ |
| |min(value1)|min(value2)| |
| +-----------+-----------+ |
| | 1| 100| |
| +-----------+-----------+ |
| """ |
| return _invoke_function_over_columns("min", col) |
| |
| |
| @_try_remote_functions |
| def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: |
| """ |
| Returns the value from the `col` parameter that is associated with the maximum value |
| from the `ord` parameter. This function is often used to find the `col` parameter value |
| corresponding to the maximum `ord` parameter value within each group when used with groupBy(). |
| |
| .. versionadded:: 3.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
    The function is non-deterministic: when multiple rows share the same maximum value of `ord`,
    any of the associated values of `col` may be returned.
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The column representing the values to be returned. This could be the column instance |
| or the column name as string. |
| ord : :class:`~pyspark.sql.Column` or column name |
| The column that needs to be maximized. This could be the column instance |
| or the column name as string. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column object representing the value from `col` that is associated with |
| the maximum value from `ord`. |
| |
| Examples |
| -------- |
| Example 1: Using `max_by` with groupBy |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("Java", 2012, 20000), ("dotNET", 2012, 5000), |
| ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)], |
| ... schema=("course", "year", "earnings")) |
| >>> df.groupby("course").agg(sf.max_by("year", "earnings")).sort("course").show() |
| +------+----------------------+ |
| |course|max_by(year, earnings)| |
| +------+----------------------+ |
| | Java| 2013| |
| |dotNET| 2013| |
| +------+----------------------+ |
| |
| Example 2: Using `max_by` with different data types |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("Marketing", "Anna", 4), ("IT", "Bob", 2), |
| ... ("IT", "Charlie", 3), ("Marketing", "David", 1)], |
| ... schema=("department", "name", "years_in_dept")) |
| >>> df.groupby("department").agg( |
| ... sf.max_by("name", "years_in_dept") |
| ... ).sort("department").show() |
| +----------+---------------------------+ |
| |department|max_by(name, years_in_dept)| |
| +----------+---------------------------+ |
| | IT| Charlie| |
| | Marketing| Anna| |
| +----------+---------------------------+ |
| |
| Example 3: Using `max_by` where `ord` has multiple maximum values |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("Consult", "Eva", 6), ("Finance", "Frank", 5), |
| ... ("Finance", "George", 9), ("Consult", "Henry", 7)], |
| ... schema=("department", "name", "years_in_dept")) |
| >>> df.groupby("department").agg( |
| ... sf.max_by("name", "years_in_dept") |
| ... ).sort("department").show() |
| +----------+---------------------------+ |
| |department|max_by(name, years_in_dept)| |
| +----------+---------------------------+ |
| | Consult| Henry| |
| | Finance| George| |
| +----------+---------------------------+ |
| """ |
| return _invoke_function_over_columns("max_by", col, ord) |
| |
| |
| @_try_remote_functions |
| def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column: |
| """ |
| Returns the value from the `col` parameter that is associated with the minimum value |
| from the `ord` parameter. This function is often used to find the `col` parameter value |
| corresponding to the minimum `ord` parameter value within each group when used with groupBy(). |
| |
| .. versionadded:: 3.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
    The function is non-deterministic: when multiple rows share the same minimum value of `ord`,
    any of the associated values of `col` may be returned.
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The column representing the values that will be returned. This could be the column instance |
| or the column name as string. |
| ord : :class:`~pyspark.sql.Column` or column name |
| The column that needs to be minimized. This could be the column instance |
| or the column name as string. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Column object that represents the value from `col` associated with |
| the minimum value from `ord`. |
| |
| Examples |
| -------- |
| Example 1: Using `min_by` with groupBy: |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("Java", 2012, 20000), ("dotNET", 2012, 5000), |
| ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)], |
| ... schema=("course", "year", "earnings")) |
| >>> df.groupby("course").agg(sf.min_by("year", "earnings")).sort("course").show() |
| +------+----------------------+ |
| |course|min_by(year, earnings)| |
| +------+----------------------+ |
| | Java| 2012| |
| |dotNET| 2012| |
| +------+----------------------+ |
| |
| Example 2: Using `min_by` with different data types: |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("Marketing", "Anna", 4), ("IT", "Bob", 2), |
| ... ("IT", "Charlie", 3), ("Marketing", "David", 1)], |
| ... schema=("department", "name", "years_in_dept")) |
| >>> df.groupby("department").agg( |
| ... sf.min_by("name", "years_in_dept") |
| ... ).sort("department").show() |
| +----------+---------------------------+ |
| |department|min_by(name, years_in_dept)| |
| +----------+---------------------------+ |
| | IT| Bob| |
| | Marketing| David| |
| +----------+---------------------------+ |
| |
| Example 3: Using `min_by` where `ord` has multiple minimum values: |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("Consult", "Eva", 6), ("Finance", "Frank", 5), |
| ... ("Finance", "George", 9), ("Consult", "Henry", 7)], |
| ... schema=("department", "name", "years_in_dept")) |
| >>> df.groupby("department").agg( |
| ... sf.min_by("name", "years_in_dept") |
| ... ).sort("department").show() |
| +----------+---------------------------+ |
| |department|min_by(name, years_in_dept)| |
| +----------+---------------------------+ |
| | Consult| Eva| |
| | Finance| Frank| |
| +----------+---------------------------+ |
| """ |
| return _invoke_function_over_columns("min_by", col, ord) |
| |
| |
| @_try_remote_functions |
| def count(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the number of items in a group. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| column for computed results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.count_if` |
| |
| Examples |
| -------- |
| Example 1: Count all rows in a DataFrame |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(None,), ("a",), ("b",), ("c",)], schema=["alphabets"]) |
| >>> df.select(sf.count(sf.expr("*"))).show() |
| +--------+ |
| |count(1)| |
| +--------+ |
| | 4| |
| +--------+ |
| |
| Example 2: Count non-null values in a specific column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df.select(sf.count(df.alphabets)).show() |
| +----------------+ |
| |count(alphabets)| |
| +----------------+ |
| | 3| |
| +----------------+ |
| |
| Example 3: Count all rows in a DataFrame with multiple columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(1, "apple"), (2, "banana"), (3, None)], schema=["id", "fruit"]) |
| >>> df.select(sf.count(sf.expr("*"))).show() |
| +--------+ |
| |count(1)| |
| +--------+ |
| | 3| |
| +--------+ |
| |
| Example 4: Count non-null values in multiple columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df.select(sf.count(df.id), sf.count(df.fruit)).show() |
| +---------+------------+ |
| |count(id)|count(fruit)| |
| +---------+------------+ |
| | 3| 2| |
| +---------+------------+ |
| """ |
| return _invoke_function_over_columns("count", col) |
| |
| |
| @_try_remote_functions |
| def sum(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the sum of all values in the expression. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Calculating the sum of values in a column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(10) |
| >>> df.select(sf.sum(df["id"])).show() |
| +-------+ |
| |sum(id)| |
| +-------+ |
| | 45| |
| +-------+ |
| |
| Example 2: Using a plus expression together to calculate the sum |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, 2), (3, 4)], ["A", "B"]) |
| >>> df.select(sf.sum(sf.col("A") + sf.col("B"))).show() |
| +------------+ |
| |sum((A + B))| |
| +------------+ |
| | 10| |
| +------------+ |
| |
| Example 3: Calculating the summation of ages with None |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) |
| >>> df.select(sf.sum("age")).show() |
| +--------+ |
| |sum(age)| |
| +--------+ |
| | 6| |
| +--------+ |
| """ |
| return _invoke_function_over_columns("sum", col) |
| |
| |
| @_try_remote_functions |
| def avg(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the average of the values in a group. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Calculating the average age |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"]) |
| >>> df.select(sf.avg("age")).show() |
| +--------+ |
| |avg(age)| |
| +--------+ |
| | 8.5| |
| +--------+ |
| |
| Example 2: Calculating the average age with None |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) |
| >>> df.select(sf.avg("age")).show() |
| +--------+ |
| |avg(age)| |
| +--------+ |
| | 3.0| |
| +--------+ |
| """ |
| return _invoke_function_over_columns("avg", col) |
| |
| |
| @_try_remote_functions |
| def mean(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the average of the values in a group. |
| An alias of :func:`avg`. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Calculating the average age |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"]) |
| >>> df.select(sf.mean("age")).show() |
| +--------+ |
| |avg(age)| |
| +--------+ |
| | 8.5| |
| +--------+ |
| |
| Example 2: Calculating the average age with None |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) |
| >>> df.select(sf.mean("age")).show() |
| +--------+ |
| |avg(age)| |
| +--------+ |
| | 3.0| |
| +--------+ |
| """ |
| return _invoke_function_over_columns("mean", col) |
| |
| |
| @_try_remote_functions |
| def median(col: "ColumnOrName") -> Column: |
| """ |
| Returns the median of the values in a group. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the median of the values in a group. |
| |
    Notes
    -----
    Supports Spark Connect.

    See Also
    --------
    :meth:`pyspark.sql.functions.percentile`
    :meth:`pyspark.sql.functions.approx_percentile`
    :meth:`pyspark.sql.functions.percentile_approx`
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("Java", 2012, 20000), ("dotNET", 2012, 5000), |
| ... ("Java", 2012, 22000), ("dotNET", 2012, 10000), |
| ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)], |
| ... schema=("course", "year", "earnings")) |
| >>> df.groupby("course").agg(sf.median("earnings")).show() |
| +------+----------------+ |
| |course|median(earnings)| |
| +------+----------------+ |
| | Java| 22000.0| |
| |dotNET| 10000.0| |
| +------+----------------+ |
| """ |
| return _invoke_function_over_columns("median", col) |
| |
| |
| @_try_remote_functions |
| def sumDistinct(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the sum of distinct values in the expression. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 3.2.0 |
| Use :func:`sum_distinct` instead. |
| """ |
| warnings.warn("Deprecated in 3.2, use sum_distinct instead.", FutureWarning) |
| return sum_distinct(col) |
| |
| |
| @_try_remote_functions |
| def sum_distinct(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the sum of distinct values in the expression. |
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Using sum_distinct function on a column with all distinct values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (2,), (3,), (4,)], ["numbers"]) |
| >>> df.select(sf.sum_distinct('numbers')).show() |
| +---------------------+ |
| |sum(DISTINCT numbers)| |
| +---------------------+ |
| | 10| |
| +---------------------+ |
| |
| Example 2: Using sum_distinct function on a column with no distinct values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (1,), (1,), (1,)], ["numbers"]) |
| >>> df.select(sf.sum_distinct('numbers')).show() |
| +---------------------+ |
| |sum(DISTINCT numbers)| |
| +---------------------+ |
| | 1| |
| +---------------------+ |
| |
| Example 3: Using sum_distinct function on a column with null and duplicate values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(None,), (1,), (1,), (2,)], ["numbers"]) |
| >>> df.select(sf.sum_distinct('numbers')).show() |
| +---------------------+ |
| |sum(DISTINCT numbers)| |
| +---------------------+ |
| | 3| |
| +---------------------+ |
| |
| Example 4: Using sum_distinct function on a column with all None values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, IntegerType |
| >>> schema = StructType([StructField("numbers", IntegerType(), True)]) |
| >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) |
| >>> df.select(sf.sum_distinct('numbers')).show() |
| +---------------------+ |
| |sum(DISTINCT numbers)| |
| +---------------------+ |
| | NULL| |
| +---------------------+ |
| """ |
| return _invoke_function_over_columns("sum_distinct", col) |
| |
| |
| @_try_remote_functions |
| def listagg(col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None) -> Column: |
| """ |
| Aggregate function: returns the concatenation of non-null input values, |
| separated by the delimiter. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| delimiter : :class:`~pyspark.sql.Column`, literal string or bytes, optional |
the delimiter used to separate the values. If None (default), the values are concatenated without a separator.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Using listagg function |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',)], ['strings']) |
| >>> df.select(sf.listagg('strings')).show() |
| +----------------------+ |
| |listagg(strings, NULL)| |
| +----------------------+ |
| | abc| |
| +----------------------+ |
| |
| Example 2: Using listagg function with a delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',)], ['strings']) |
| >>> df.select(sf.listagg('strings', ', ')).show() |
| +--------------------+ |
| |listagg(strings, , )| |
| +--------------------+ |
| | a, b, c| |
| +--------------------+ |
| |
| Example 3: Using listagg function with a binary column and delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(b'\x01',), (b'\x02',), (None,), (b'\x03',)], ['bytes']) |
| >>> df.select(sf.listagg('bytes', b'\x42')).show() |
| +---------------------+ |
| |listagg(bytes, X'42')| |
| +---------------------+ |
| | [01 42 02 42 03]| |
| +---------------------+ |
| |
| Example 4: Using listagg function on a column with all None values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, StringType |
| >>> schema = StructType([StructField("strings", StringType(), True)]) |
| >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) |
| >>> df.select(sf.listagg('strings')).show() |
| +----------------------+ |
| |listagg(strings, NULL)| |
| +----------------------+ |
| | NULL| |
| +----------------------+ |
| """ |
| if delimiter is None: |
| return _invoke_function_over_columns("listagg", col) |
| else: |
| return _invoke_function_over_columns("listagg", col, lit(delimiter)) |
| |
| |
| @_try_remote_functions |
| def listagg_distinct( |
| col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None |
| ) -> Column: |
| """ |
| Aggregate function: returns the concatenation of distinct non-null input values, |
| separated by the delimiter. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| delimiter : :class:`~pyspark.sql.Column`, literal string or bytes, optional |
the delimiter used to separate the values. If None (default), the values are concatenated without a separator.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Using listagg_distinct function |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',), ('b',)], ['strings']) |
| >>> df.select(sf.listagg_distinct('strings')).show() |
| +-------------------------------+ |
| |listagg(DISTINCT strings, NULL)| |
| +-------------------------------+ |
| | abc| |
| +-------------------------------+ |
| |
| Example 2: Using listagg_distinct function with a delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',), ('b',)], ['strings']) |
| >>> df.select(sf.listagg_distinct('strings', ', ')).show() |
| +-----------------------------+ |
| |listagg(DISTINCT strings, , )| |
| +-----------------------------+ |
| | a, b, c| |
| +-----------------------------+ |
| |
| Example 3: Using listagg_distinct function with a binary column and delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(b'\x01',), (b'\x02',), (None,), (b'\x03',), (b'\x02',)], |
| ... ['bytes']) |
| >>> df.select(sf.listagg_distinct('bytes', b'\x42')).show() |
| +------------------------------+ |
| |listagg(DISTINCT bytes, X'42')| |
| +------------------------------+ |
| | [01 42 02 42 03]| |
| +------------------------------+ |
| |
| Example 4: Using listagg_distinct function on a column with all None values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, StringType |
| >>> schema = StructType([StructField("strings", StringType(), True)]) |
| >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) |
| >>> df.select(sf.listagg_distinct('strings')).show() |
| +-------------------------------+ |
| |listagg(DISTINCT strings, NULL)| |
| +-------------------------------+ |
| | NULL| |
| +-------------------------------+ |
| """ |
| if delimiter is None: |
| return _invoke_function_over_columns("listagg_distinct", col) |
| else: |
| return _invoke_function_over_columns("listagg_distinct", col, lit(delimiter)) |
| |
| |
| @_try_remote_functions |
| def string_agg( |
| col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None |
| ) -> Column: |
| """ |
| Aggregate function: returns the concatenation of non-null input values, |
| separated by the delimiter. |
| |
| An alias of :func:`listagg`. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| delimiter : :class:`~pyspark.sql.Column`, literal string or bytes, optional |
the delimiter used to separate the values. If None (default), the values are concatenated without a separator.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Using string_agg function |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',)], ['strings']) |
| >>> df.select(sf.string_agg('strings')).show() |
| +-------------------------+ |
| |string_agg(strings, NULL)| |
| +-------------------------+ |
| | abc| |
| +-------------------------+ |
| |
| Example 2: Using string_agg function with a delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',)], ['strings']) |
| >>> df.select(sf.string_agg('strings', ', ')).show() |
| +-----------------------+ |
| |string_agg(strings, , )| |
| +-----------------------+ |
| | a, b, c| |
| +-----------------------+ |
| |
| Example 3: Using string_agg function with a binary column and delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(b'\x01',), (b'\x02',), (None,), (b'\x03',)], ['bytes']) |
| >>> df.select(sf.string_agg('bytes', b'\x42')).show() |
| +------------------------+ |
| |string_agg(bytes, X'42')| |
| +------------------------+ |
| | [01 42 02 42 03]| |
| +------------------------+ |
| |
| Example 4: Using string_agg function on a column with all None values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, StringType |
| >>> schema = StructType([StructField("strings", StringType(), True)]) |
| >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) |
| >>> df.select(sf.string_agg('strings')).show() |
| +-------------------------+ |
| |string_agg(strings, NULL)| |
| +-------------------------+ |
| | NULL| |
| +-------------------------+ |
| """ |
| if delimiter is None: |
| return _invoke_function_over_columns("string_agg", col) |
| else: |
| return _invoke_function_over_columns("string_agg", col, lit(delimiter)) |
| |
| |
| @_try_remote_functions |
| def string_agg_distinct( |
| col: "ColumnOrName", delimiter: Optional[Union[Column, str, bytes]] = None |
| ) -> Column: |
| """ |
| Aggregate function: returns the concatenation of distinct non-null input values, |
| separated by the delimiter. |
| |
| An alias of :func:`listagg_distinct`. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| delimiter : :class:`~pyspark.sql.Column`, literal string or bytes, optional |
the delimiter used to separate the values. If None (default), the values are concatenated without a separator.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Using string_agg_distinct function |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',), ('b',)], ['strings']) |
| >>> df.select(sf.string_agg_distinct('strings')).show() |
| +----------------------------------+ |
| |string_agg(DISTINCT strings, NULL)| |
| +----------------------------------+ |
| | abc| |
| +----------------------------------+ |
| |
| Example 2: Using string_agg_distinct function with a delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a',), ('b',), (None,), ('c',), ('b',)], ['strings']) |
| >>> df.select(sf.string_agg_distinct('strings', ', ')).show() |
| +--------------------------------+ |
| |string_agg(DISTINCT strings, , )| |
| +--------------------------------+ |
| | a, b, c| |
| +--------------------------------+ |
| |
| Example 3: Using string_agg_distinct function with a binary column and delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(b'\x01',), (b'\x02',), (None,), (b'\x03',), (b'\x02',)], |
| ... ['bytes']) |
| >>> df.select(sf.string_agg_distinct('bytes', b'\x42')).show() |
| +---------------------------------+ |
| |string_agg(DISTINCT bytes, X'42')| |
| +---------------------------------+ |
| | [01 42 02 42 03]| |
| +---------------------------------+ |
| |
| Example 4: Using string_agg_distinct function on a column with all None values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, StringType |
| >>> schema = StructType([StructField("strings", StringType(), True)]) |
| >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)], schema=schema) |
| >>> df.select(sf.string_agg_distinct('strings')).show() |
| +----------------------------------+ |
| |string_agg(DISTINCT strings, NULL)| |
| +----------------------------------+ |
| | NULL| |
| +----------------------------------+ |
| """ |
| if delimiter is None: |
| return _invoke_function_over_columns("string_agg_distinct", col) |
| else: |
| return _invoke_function_over_columns("string_agg_distinct", col, lit(delimiter)) |
| |
| |
| @_try_remote_functions |
| def product(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the product of the values in a group. |
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column containing values to be multiplied together |
| |
| Returns |
| ------- |
:class:`~pyspark.sql.Column`
| the column for computed results. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT id % 3 AS mod3, id AS value FROM RANGE(10)") |
| >>> df.groupBy('mod3').agg(sf.product('value')).orderBy('mod3').show() |
| +----+--------------+ |
| |mod3|product(value)| |
| +----+--------------+ |
| | 0| 0.0| |
| | 1| 28.0| |
| | 2| 80.0| |
| +----+--------------+ |
| """ |
| return _invoke_function_over_columns("product", col) |
| |
| |
| @_try_remote_functions |
| def acos(col: "ColumnOrName") -> Column: |
| """ |
| Mathematical Function: Computes the inverse cosine (also known as arccosine) |
| of the given column or expression. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column or expression to compute the inverse cosine on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column object representing the inverse cosine of the input. |
| |
| Examples |
| -------- |
| Example 1: Compute the inverse cosine |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-1.0,), (-0.5,), (0.0,), (0.5,), (1.0,)], ["value"]) |
| >>> df.select("*", sf.acos("value")).show() |
| +-----+------------------+ |
| |value| ACOS(value)| |
| +-----+------------------+ |
| | -1.0| 3.141592653589...| |
| | -0.5|2.0943951023931...| |
| | 0.0|1.5707963267948...| |
| | 0.5|1.0471975511965...| |
| | 1.0| 0.0| |
| +-----+------------------+ |
| |
| Example 2: Compute the inverse cosine of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (-2), (2), (NULL) AS TAB(value)" |
| ... ).select("*", sf.acos("value")).show() |
| +-----+-----------+ |
| |value|ACOS(value)| |
| +-----+-----------+ |
| | -2| NaN| |
| | 2| NaN| |
| | NULL| NULL| |
| +-----+-----------+ |
| """ |
| return _invoke_function_over_columns("acos", col) |
| |
| |
| @_try_remote_functions |
| def acosh(col: "ColumnOrName") -> Column: |
| """ |
| Mathematical Function: Computes the inverse hyperbolic cosine (also known as arcosh) |
| of the given column or expression. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column or expression to compute the inverse hyperbolic cosine on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column object representing the inverse hyperbolic cosine of the input. |
| |
| Examples |
| -------- |
| Example 1: Compute the inverse hyperbolic cosine |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (2,)], ["value"]) |
| >>> df.select("*", sf.acosh(df.value)).show() |
| +-----+------------------+ |
| |value| ACOSH(value)| |
| +-----+------------------+ |
| | 1| 0.0| |
| | 2|1.3169578969248...| |
| +-----+------------------+ |
| |
| Example 2: Compute the inverse hyperbolic cosine of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (-0.5), (0.5), (NULL) AS TAB(value)" |
| ... ).select("*", sf.acosh("value")).show() |
| +-----+------------+ |
| |value|ACOSH(value)| |
| +-----+------------+ |
| | -0.5| NaN| |
| | 0.5| NaN| |
| | NULL| NULL| |
| +-----+------------+ |
| """ |
| return _invoke_function_over_columns("acosh", col) |
| |
| |
| @_try_remote_functions |
| def asin(col: "ColumnOrName") -> Column: |
| """ |
| Computes inverse sine of the input column. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| inverse sine of `col`, as if computed by `java.lang.Math.asin()` |
| |
| Examples |
| -------- |
| Example 1: Compute the inverse sine |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-0.5,), (0.0,), (0.5,)], ["value"]) |
| >>> df.select("*", sf.asin(df.value)).show() |
| +-----+-------------------+ |
| |value| ASIN(value)| |
| +-----+-------------------+ |
| | -0.5|-0.5235987755982...| |
| | 0.0| 0.0| |
| | 0.5| 0.5235987755982...| |
| +-----+-------------------+ |
| |
| Example 2: Compute the inverse sine of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (-2), (2), (NULL) AS TAB(value)" |
| ... ).select("*", sf.asin("value")).show() |
| +-----+-----------+ |
| |value|ASIN(value)| |
| +-----+-----------+ |
| | -2| NaN| |
| | 2| NaN| |
| | NULL| NULL| |
| +-----+-----------+ |
| """ |
| return _invoke_function_over_columns("asin", col) |
| |
| |
| @_try_remote_functions |
| def asinh(col: "ColumnOrName") -> Column: |
| """ |
| Computes inverse hyperbolic sine of the input column. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Compute the inverse hyperbolic sine |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-0.5,), (0.0,), (0.5,)], ["value"]) |
| >>> df.select("*", sf.asinh(df.value)).show() |
| +-----+--------------------+ |
| |value| ASINH(value)| |
| +-----+--------------------+ |
| | -0.5|-0.48121182505960...| |
| | 0.0| 0.0| |
| | 0.5| 0.48121182505960...| |
| +-----+--------------------+ |
| |
| Example 2: Compute the inverse hyperbolic sine of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.asinh("value")).show() |
| +-----+------------+ |
| |value|ASINH(value)| |
| +-----+------------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+------------+ |
| """ |
| return _invoke_function_over_columns("asinh", col) |
| |
| |
| @_try_remote_functions |
| def atan(col: "ColumnOrName") -> Column: |
| """ |
Computes inverse tangent of the input column.
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| inverse tangent of `col`, as if computed by `java.lang.Math.atan()` |
| |
| Examples |
| -------- |
| Example 1: Compute the inverse tangent |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-0.5,), (0.0,), (0.5,)], ["value"]) |
| >>> df.select("*", sf.atan(df.value)).show() |
| +-----+-------------------+ |
| |value| ATAN(value)| |
| +-----+-------------------+ |
| | -0.5|-0.4636476090008...| |
| | 0.0| 0.0| |
| | 0.5| 0.4636476090008...| |
| +-----+-------------------+ |
| |
| Example 2: Compute the inverse tangent of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.atan("value")).show() |
| +-----+-----------+ |
| |value|ATAN(value)| |
| +-----+-----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+-----------+ |
| """ |
| return _invoke_function_over_columns("atan", col) |
| |
| |
| @_try_remote_functions |
| def atanh(col: "ColumnOrName") -> Column: |
| """ |
| Computes inverse hyperbolic tangent of the input column. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Compute the inverse hyperbolic tangent |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-0.5,), (0.0,), (0.5,)], ["value"]) |
| >>> df.select("*", sf.atanh(df.value)).show() |
| +-----+-------------------+ |
| |value| ATANH(value)| |
| +-----+-------------------+ |
| | -0.5|-0.5493061443340...| |
| | 0.0| 0.0| |
| | 0.5| 0.5493061443340...| |
| +-----+-------------------+ |
| |
| Example 2: Compute the inverse hyperbolic tangent of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (-2), (2), (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.atanh("value")).show() |
| +-----+------------+ |
| |value|ATANH(value)| |
| +-----+------------+ |
| | -2.0| NaN| |
| | 2.0| NaN| |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+------------+ |
| """ |
| return _invoke_function_over_columns("atanh", col) |
| |
| |
| @_try_remote_functions |
| def cbrt(col: "ColumnOrName") -> Column: |
| """ |
| Computes the cube-root of the given value. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| Example 1: Compute the cube-root |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-8,), (0,), (8,)], ["value"]) |
| >>> df.select("*", sf.cbrt(df.value)).show() |
| +-----+-----------+ |
| |value|CBRT(value)| |
| +-----+-----------+ |
| | -8| -2.0| |
| | 0| 0.0| |
| | 8| 2.0| |
| +-----+-----------+ |
| |
| Example 2: Compute the cube-root of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.cbrt("value")).show() |
| +-----+-----------+ |
| |value|CBRT(value)| |
| +-----+-----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+-----------+ |
| """ |
| return _invoke_function_over_columns("cbrt", col) |
| |
| |
| @_try_remote_functions |
| def ceil(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Column: |
| """ |
| Computes the ceiling of the given value. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column or column name to compute the ceiling on. |
| scale : :class:`~pyspark.sql.Column` or int, optional |
An optional parameter to control the rounding behavior: the number of decimal
places to round to. Negative values round to the left of the decimal point.
| |
| .. versionadded:: 4.0.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column for the computed results. |
| |
| Examples |
| -------- |
| Example 1: Compute the ceiling of a column value |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.ceil(sf.lit(-0.1))).show() |
| +----------+ |
| |CEIL(-0.1)| |
| +----------+ |
| | 0| |
| +----------+ |
| |
| Example 2: Compute the ceiling of a column value with a specified scale |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.ceil(sf.lit(-0.1), 1)).show() |
| +-------------+ |
| |ceil(-0.1, 1)| |
| +-------------+ |
| | -0.1| |
| +-------------+ |
| """ |
| if scale is None: |
| return _invoke_function_over_columns("ceil", col) |
| else: |
| scale = _enum_to_value(scale) |
| scale = lit(scale) if isinstance(scale, int) else scale |
| return _invoke_function_over_columns("ceil", col, scale) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def ceiling(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Column: |
| """ |
| Computes the ceiling of the given value. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column or column name to compute the ceiling on. |
scale : :class:`~pyspark.sql.Column` or int, optional
An optional parameter to control the rounding behavior: the number of decimal
places to round to. Negative values round to the left of the decimal point.
| |
| .. versionadded:: 4.0.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column for the computed results. |
| |
| Examples |
| -------- |
| Example 1: Compute the ceiling of a column value |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.ceiling(sf.lit(-0.1))).show() |
| +-------------+ |
| |ceiling(-0.1)| |
| +-------------+ |
| | 0| |
| +-------------+ |
| |
| Example 2: Compute the ceiling of a column value with a specified scale |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.ceiling(sf.lit(-0.1), 1)).show() |
| +----------------+ |
| |ceiling(-0.1, 1)| |
| +----------------+ |
| | -0.1| |
| +----------------+ |
| """ |
| if scale is None: |
| return _invoke_function_over_columns("ceiling", col) |
| else: |
| scale = _enum_to_value(scale) |
| scale = lit(scale) if isinstance(scale, int) else scale |
| return _invoke_function_over_columns("ceiling", col, scale) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def cos(col: "ColumnOrName") -> Column: |
| """ |
| Computes cosine of the input column. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| angle in radians |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| cosine of the angle, as if computed by `java.lang.Math.cos()`. |
| |
| Examples |
| -------- |
| Example 1: Compute the cosine |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (PI()), (PI() / 4), (PI() / 16) AS TAB(value)" |
| ... ).select("*", sf.cos("value")).show() |
| +-------------------+------------------+ |
| | value| COS(value)| |
| +-------------------+------------------+ |
| | 3.141592653589...| -1.0| |
| | 0.7853981633974...|0.7071067811865...| |
| |0.19634954084936...|0.9807852804032...| |
| +-------------------+------------------+ |
| |
| Example 2: Compute the cosine of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.cos("value")).show() |
| +-----+----------+ |
| |value|COS(value)| |
| +-----+----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+----------+ |
| """ |
| return _invoke_function_over_columns("cos", col) |
| |
| |
| @_try_remote_functions |
| def cosh(col: "ColumnOrName") -> Column: |
| """ |
| Computes hyperbolic cosine of the input column. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| hyperbolic angle |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()` |
| |
| Examples |
| -------- |
Example 1: Compute the hyperbolic cosine
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-1,), (0,), (1,)], ["value"]) |
| >>> df.select("*", sf.cosh(df.value)).show() |
| +-----+-----------------+ |
| |value| COSH(value)| |
| +-----+-----------------+ |
| | -1|1.543080634815...| |
| | 0| 1.0| |
| | 1|1.543080634815...| |
| +-----+-----------------+ |
| |
Example 2: Compute the hyperbolic cosine of invalid values
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.cosh("value")).show() |
| +-----+-----------+ |
| |value|COSH(value)| |
| +-----+-----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+-----------+ |
| """ |
| return _invoke_function_over_columns("cosh", col) |
| |
| |
| @_try_remote_functions |
| def cot(col: "ColumnOrName") -> Column: |
| """ |
| Computes cotangent of the input column. |
| |
| .. versionadded:: 3.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| angle in radians. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| cotangent of the angle. |
| |
| Examples |
| -------- |
| Example 1: Compute the cotangent |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (PI() / 4), (PI() / 16) AS TAB(value)" |
| ... ).select("*", sf.cot("value")).show() |
| +-------------------+------------------+ |
| | value| COT(value)| |
| +-------------------+------------------+ |
| | 0.7853981633974...|1.0000000000000...| |
| |0.19634954084936...| 5.027339492125...| |
| +-------------------+------------------+ |
| |
| Example 2: Compute the cotangent of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (0.0), (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.cot("value")).show() |
| +-----+----------+ |
| |value|COT(value)| |
| +-----+----------+ |
| | 0.0| Infinity| |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+----------+ |
| """ |
| return _invoke_function_over_columns("cot", col) |
| |
| |
| @_try_remote_functions |
| def csc(col: "ColumnOrName") -> Column: |
| """ |
| Computes cosecant of the input column. |
| |
| .. versionadded:: 3.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| angle in radians. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| cosecant of the angle. |
| |
| Examples |
| -------- |
| Example 1: Compute the cosecant |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (PI() / 2), (PI() / 4) AS TAB(value)" |
| ... ).select("*", sf.csc("value")).show() |
| +------------------+------------------+ |
| | value| CSC(value)| |
| +------------------+------------------+ |
| |1.5707963267948...| 1.0| |
| |0.7853981633974...|1.4142135623730...| |
| +------------------+------------------+ |
| |
| Example 2: Compute the cosecant of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (0.0), (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.csc("value")).show() |
| +-----+----------+ |
| |value|CSC(value)| |
| +-----+----------+ |
| | 0.0| Infinity| |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+----------+ |
| """ |
| return _invoke_function_over_columns("csc", col) |
| |
| |
| @_try_remote_functions |
| def e() -> Column: |
| """Returns Euler's number. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.e()).show() |
| +-----------------+ |
| | E()| |
| +-----------------+ |
| |2.718281828459045| |
| +-----------------+ |
| """ |
| return _invoke_function("e") |
| |
| |
| @_try_remote_functions |
| def exp(col: "ColumnOrName") -> Column: |
| """ |
| Computes the exponential of the given value. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to calculate exponential for. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| exponential of the given value. |
| |
| Examples |
| -------- |
| Example 1: Compute the exponential |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT id AS value FROM RANGE(5)") |
| >>> df.select("*", sf.exp(df.value)).show() |
| +-----+------------------+ |
| |value| EXP(value)| |
| +-----+------------------+ |
| | 0| 1.0| |
| | 1|2.7182818284590...| |
| | 2| 7.38905609893...| |
| | 3|20.085536923187...| |
| | 4|54.598150033144...| |
| +-----+------------------+ |
| |
| Example 2: Compute the exponential of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.exp("value")).show() |
| +-----+----------+ |
| |value|EXP(value)| |
| +-----+----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+----------+ |
| """ |
| return _invoke_function_over_columns("exp", col) |
| |
| |
| @_try_remote_functions |
| def expm1(col: "ColumnOrName") -> Column: |
| """ |
Computes the exponential of the given value minus one. For inputs close to zero,
this is more accurate than computing `exp(col) - 1` directly.
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to calculate exponential for. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| exponential less one. |
| |
| Examples |
| -------- |
| Example 1: Compute the exponential minus one |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT id AS value FROM RANGE(5)") |
| >>> df.select("*", sf.expm1(df.value)).show() |
| +-----+------------------+ |
| |value| EXPM1(value)| |
| +-----+------------------+ |
| | 0| 0.0| |
| | 1| 1.718281828459...| |
| | 2| 6.38905609893...| |
| | 3|19.085536923187...| |
| | 4|53.598150033144...| |
| +-----+------------------+ |
| |
| Example 2: Compute the exponential minus one of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.expm1("value")).show() |
| +-----+------------+ |
| |value|EXPM1(value)| |
| +-----+------------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+------------+ |
| """ |
| return _invoke_function_over_columns("expm1", col) |
| |
| |
| @_try_remote_functions |
| def floor(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Column: |
| """ |
| Computes the floor of the given value. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column or column name to compute the floor on. |
| scale : :class:`~pyspark.sql.Column` or int, optional |
An optional parameter to control the rounding behavior: the number of decimal
places to round to. Negative values round to the left of the decimal point.
| |
.. versionadded:: 4.0.0

| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| nearest integer that is less than or equal to given value. |
| |
| Examples |
| -------- |
| Example 1: Compute the floor of a column value |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.floor(sf.lit(2.5))).show() |
| +----------+ |
| |FLOOR(2.5)| |
| +----------+ |
| | 2| |
| +----------+ |
| |
| Example 2: Compute the floor of a column value with a specified scale |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.floor(sf.lit(2.1267), sf.lit(2))).show() |
| +----------------+ |
| |floor(2.1267, 2)| |
| +----------------+ |
| | 2.12| |
| +----------------+ |
| """ |
| if scale is None: |
| return _invoke_function_over_columns("floor", col) |
| else: |
| scale = _enum_to_value(scale) |
| scale = lit(scale) if isinstance(scale, int) else scale |
| return _invoke_function_over_columns("floor", col, scale) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def log(col: "ColumnOrName") -> Column: |
| """ |
| Computes the natural logarithm of the given value. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to calculate natural logarithm for. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| natural logarithm of the given value. |
| |
| Examples |
| -------- |
| Example 1: Compute the natural logarithm of E |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.log(sf.e())).show() |
| +-------+ |
| |ln(E())| |
| +-------+ |
| | 1.0| |
| +-------+ |
| |
| Example 2: Compute the natural logarithm of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (-1), (0), (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.log("value")).show() |
| +-----+---------+ |
| |value|ln(value)| |
| +-----+---------+ |
| | -1.0| NULL| |
| | 0.0| NULL| |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+---------+ |
| """ |
| return _invoke_function_over_columns("log", col) |
| |
| |
| @_try_remote_functions |
| def log10(col: "ColumnOrName") -> Column: |
| """ |
| Computes the logarithm of the given value in Base 10. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to calculate logarithm for. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| logarithm of the given value in Base 10. |
| |
| Examples |
| -------- |
| Example 1: Compute the logarithm in Base 10 |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (10,), (100,)], ["value"]) |
| >>> df.select("*", sf.log10(df.value)).show() |
| +-----+------------+ |
| |value|LOG10(value)| |
| +-----+------------+ |
| | 1| 0.0| |
| | 10| 1.0| |
| | 100| 2.0| |
| +-----+------------+ |
| |
| Example 2: Compute the logarithm in Base 10 of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (-1), (0), (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.log10("value")).show() |
| +-----+------------+ |
| |value|LOG10(value)| |
| +-----+------------+ |
| | -1.0| NULL| |
| | 0.0| NULL| |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+------------+ |
| """ |
| return _invoke_function_over_columns("log10", col) |
| |
| |
| @_try_remote_functions |
| def log1p(col: "ColumnOrName") -> Column: |
| """ |
Computes the natural logarithm of the given value plus one. For inputs close to zero,
this is more accurate than computing `log(1 + col)` directly.
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to calculate natural logarithm for. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
natural logarithm of the given value plus one.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.log1p(sf.e())).show() |
| +------------------+ |
| | LOG1P(E())| |
| +------------------+ |
| |1.3132616875182...| |
| +------------------+ |
| |
| Same as: |
| |
| >>> spark.range(1).select(sf.log(sf.e() + 1)).show() |
| +------------------+ |
| | ln((E() + 1))| |
| +------------------+ |
| |1.3132616875182...| |
| +------------------+ |
| """ |
| return _invoke_function_over_columns("log1p", col) |
| |
| |
| @_try_remote_functions |
| def negative(col: "ColumnOrName") -> Column: |
| """ |
Returns the negated value.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to calculate negative value for. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| negative value. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(-1,), (0,), (1,)], ["value"]) |
| >>> df.select("*", sf.negative(df.value)).show() |
| +-----+---------------+ |
| |value|negative(value)| |
| +-----+---------------+ |
| | -1| 1| |
| | 0| 0| |
| | 1| -1| |
| +-----+---------------+ |
| """ |
| return _invoke_function_over_columns("negative", col) |
| |
| |
# `negate` is an alias of `negative`.
negate = negative
| |
| |
| @_try_remote_functions |
| def pi() -> Column: |
| """Returns Pi. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.pi()).show() |
| +-----------------+ |
| | PI()| |
| +-----------------+ |
| |3.141592653589793| |
| +-----------------+ |
| """ |
| return _invoke_function("pi") |
| |
| |
| @_try_remote_functions |
| def positive(col: "ColumnOrName") -> Column: |
| """ |
Returns the value unchanged (unary `+` operator).
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input value column. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
the input value, unchanged.
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(-1,), (0,), (1,)], ["value"]) |
| >>> df.select("*", sf.positive(df.value)).show() |
| +-----+---------+ |
| |value|(+ value)| |
| +-----+---------+ |
| | -1| -1| |
| | 0| 0| |
| | 1| 1| |
| +-----+---------+ |
| """ |
| return _invoke_function_over_columns("positive", col) |
| |
| |
| @_try_remote_functions |
| def rint(col: "ColumnOrName") -> Column: |
| """ |
| Returns the double value that is closest in value to the argument and |
| is equal to a mathematical integer. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.rint(sf.lit(10.6))).show() |
| +----------+ |
| |rint(10.6)| |
| +----------+ |
| | 11.0| |
| +----------+ |
| |
| >>> spark.range(1).select(sf.rint(sf.lit(10.3))).show() |
| +----------+ |
| |rint(10.3)| |
| +----------+ |
| | 10.0| |
| +----------+ |
| """ |
| return _invoke_function_over_columns("rint", col) |
| |
| |
| @_try_remote_functions |
| def sec(col: "ColumnOrName") -> Column: |
| """ |
| Computes secant of the input column. |
| |
| .. versionadded:: 3.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| Angle in radians |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Secant of the angle. |
| |
| Examples |
| -------- |
| Example 1: Compute the secant |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (PI() / 4), (PI() / 16) AS TAB(value)" |
| ... ).select("*", sf.sec("value")).show() |
| +-------------------+------------------+ |
| | value| SEC(value)| |
| +-------------------+------------------+ |
| | 0.7853981633974...| 1.414213562373...| |
| |0.19634954084936...|1.0195911582083...| |
| +-------------------+------------------+ |
| |
| Example 2: Compute the secant of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.sec("value")).show() |
| +-----+----------+ |
| |value|SEC(value)| |
| +-----+----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+----------+ |
| """ |
| return _invoke_function_over_columns("sec", col) |
| |
| |
| @_try_remote_functions |
| def signum(col: "ColumnOrName") -> Column: |
| """ |
| Computes the signum of the given value. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
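
See Also
--------
:meth:`pyspark.sql.functions.sign`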
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select( |
| ... sf.signum(sf.lit(-5)), |
| ... sf.signum(sf.lit(6)), |
| ... sf.signum(sf.lit(float('nan'))), |
| ... sf.signum(sf.lit(None)) |
| ... ).show() |
| +----------+---------+-----------+------------+ |
| |SIGNUM(-5)|SIGNUM(6)|SIGNUM(NaN)|SIGNUM(NULL)| |
| +----------+---------+-----------+------------+ |
| | -1.0| 1.0| NaN| NULL| |
| +----------+---------+-----------+------------+ |
| """ |
| return _invoke_function_over_columns("signum", col) |
| |
| |
| @_try_remote_functions |
| def sign(col: "ColumnOrName") -> Column: |
| """ |
| Computes the signum of the given value. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
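
See Also
--------
:meth:`pyspark.sql.functions.signum`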
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select( |
| ... sf.sign(sf.lit(-5)), |
| ... sf.sign(sf.lit(6)), |
| ... sf.sign(sf.lit(float('nan'))), |
| ... sf.sign(sf.lit(None)) |
| ... ).show() |
| +--------+-------+---------+----------+ |
| |sign(-5)|sign(6)|sign(NaN)|sign(NULL)| |
| +--------+-------+---------+----------+ |
| | -1.0| 1.0| NaN| NULL| |
| +--------+-------+---------+----------+ |
| """ |
| return _invoke_function_over_columns("sign", col) |
| |
| |
| @_try_remote_functions |
| def sin(col: "ColumnOrName") -> Column: |
| """ |
| Computes sine of the input column. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| sine of the angle, as if computed by `java.lang.Math.sin()` |
| |
| Examples |
| -------- |
| Example 1: Compute the sine |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (0.0), (PI() / 2), (PI() / 4) AS TAB(value)" |
| ... ).select("*", sf.sin("value")).show() |
| +------------------+------------------+ |
| | value| SIN(value)| |
| +------------------+------------------+ |
| | 0.0| 0.0| |
| |1.5707963267948...| 1.0| |
| |0.7853981633974...|0.7071067811865...| |
| +------------------+------------------+ |
| |
| Example 2: Compute the sine of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.sin("value")).show() |
| +-----+----------+ |
| |value|SIN(value)| |
| +-----+----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+----------+ |
| """ |
| return _invoke_function_over_columns("sin", col) |
| |
| |
| @_try_remote_functions |
| def sinh(col: "ColumnOrName") -> Column: |
| """ |
| Computes hyperbolic sine of the input column. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| hyperbolic angle. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| hyperbolic sine of the given value, |
| as if computed by `java.lang.Math.sinh()` |
| |
| Examples |
| -------- |
| Example 1: Compute the hyperbolic sine |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-1,), (0,), (1,)], ["value"]) |
| >>> df.select("*", sf.sinh(df.value)).show() |
| +-----+-------------------+ |
| |value| SINH(value)| |
| +-----+-------------------+ |
| | -1|-1.1752011936438...| |
| | 0| 0.0| |
| | 1| 1.1752011936438...| |
| +-----+-------------------+ |
| |
| Example 2: Compute the hyperbolic sine of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.sinh("value")).show() |
| +-----+-----------+ |
| |value|SINH(value)| |
| +-----+-----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+-----------+ |
| """ |
| return _invoke_function_over_columns("sinh", col) |
| |
| |
| @_try_remote_functions |
| def tan(col: "ColumnOrName") -> Column: |
| """ |
| Computes tangent of the input column. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| angle in radians |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| tangent of the given value, as if computed by `java.lang.Math.tan()` |
| |
| Examples |
| -------- |
| Example 1: Compute the tangent |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (0.0), (PI() / 4), (PI() / 6) AS TAB(value)" |
| ... ).select("*", sf.tan("value")).show() |
| +------------------+------------------+ |
| | value| TAN(value)| |
| +------------------+------------------+ |
| | 0.0| 0.0| |
| |0.7853981633974...|0.9999999999999...| |
| |0.5235987755982...|0.5773502691896...| |
| +------------------+------------------+ |
| |
| Example 2: Compute the tangent of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.tan("value")).show() |
| +-----+----------+ |
| |value|TAN(value)| |
| +-----+----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+----------+ |
| """ |
| return _invoke_function_over_columns("tan", col) |
| |
| |
| @_try_remote_functions |
| def tanh(col: "ColumnOrName") -> Column: |
| """ |
| Computes hyperbolic tangent of the input column. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| hyperbolic angle |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
hyperbolic tangent of the given value,
| as if computed by `java.lang.Math.tanh()` |
| |
| Examples |
| -------- |
Example 1: Compute the hyperbolic tangent
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(-1,), (0,), (1,)], ["value"]) |
| >>> df.select("*", sf.tanh(df.value)).show() |
| +-----+-------------------+ |
| |value| TANH(value)| |
| +-----+-------------------+ |
| | -1|-0.7615941559557...| |
| | 0| 0.0| |
| | 1| 0.7615941559557...| |
| +-----+-------------------+ |
| |
| Example 2: Compute the hyperbolic tangent of invalid values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (FLOAT('NAN')), (NULL) AS TAB(value)" |
| ... ).select("*", sf.tanh("value")).show() |
| +-----+-----------+ |
| |value|TANH(value)| |
| +-----+-----------+ |
| | NaN| NaN| |
| | NULL| NULL| |
| +-----+-----------+ |
| """ |
| return _invoke_function_over_columns("tanh", col) |
| |
| |
| @_try_remote_functions |
| def toDegrees(col: "ColumnOrName") -> Column: |
| """ |
| Converts an angle measured in radians to an approximately equivalent angle |
| measured in degrees. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 2.1.0 |
| Use :func:`degrees` instead. |
| """ |
| warnings.warn("Deprecated in 2.1, use degrees instead.", FutureWarning) |
| return degrees(col) |
| |
| |
| @_try_remote_functions |
| def toRadians(col: "ColumnOrName") -> Column: |
| """ |
| Converts an angle measured in degrees to an approximately equivalent angle |
| measured in radians. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 2.1.0 |
| Use :func:`radians` instead. |
| """ |
| warnings.warn("Deprecated in 2.1, use radians instead.", FutureWarning) |
| return radians(col) |
| |
| |
| @_try_remote_functions |
| def bitwiseNOT(col: "ColumnOrName") -> Column: |
| """ |
| Computes bitwise not. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 3.2.0 |
| Use :func:`bitwise_not` instead. |
| """ |
| warnings.warn("Deprecated in 3.2, use bitwise_not instead.", FutureWarning) |
| return bitwise_not(col) |
| |
| |
| @_try_remote_functions |
| def bitwise_not(col: "ColumnOrName") -> Column: |
| """ |
| Computes bitwise not. |
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (0), (1), (2), (3), (NULL) AS TAB(value)" |
| ... ).select("*", sf.bitwise_not("value")).show() |
| +-----+------+ |
| |value|~value| |
| +-----+------+ |
| | 0| -1| |
| | 1| -2| |
| | 2| -3| |
| | 3| -4| |
| | NULL| NULL| |
| +-----+------+ |
| """ |
| return _invoke_function_over_columns("bitwise_not", col) |
| |
| |
| @_try_remote_functions |
| def bit_count(col: "ColumnOrName") -> Column: |
| """ |
Returns the number of bits that are set in the argument `col` as an unsigned 64-bit
integer, or NULL if the argument is NULL.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
the number of bits that are set in the argument `col` as an unsigned 64-bit integer,
or NULL if the argument is NULL.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bit_get` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (0), (1), (2), (3), (NULL) AS TAB(value)" |
| ... ).select("*", sf.bit_count("value")).show() |
| +-----+----------------+ |
| |value|bit_count(value)| |
| +-----+----------------+ |
| | 0| 0| |
| | 1| 1| |
| | 2| 1| |
| | 3| 2| |
| | NULL| NULL| |
| +-----+----------------+ |
| """ |
| return _invoke_function_over_columns("bit_count", col) |
| |
| |
| @_try_remote_functions |
| def bit_get(col: "ColumnOrName", pos: "ColumnOrName") -> Column: |
| """ |
| Returns the value of the bit (0 or 1) at the specified position. |
| The positions are numbered from right to left, starting at zero. |
| The position argument cannot be negative. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| pos : :class:`~pyspark.sql.Column` or column name |
| The positions are numbered from right to left, starting at zero. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the value of the bit (0 or 1) at the specified position. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bit_count` |
| :meth:`pyspark.sql.functions.getbit` |
| |
| Examples |
| -------- |
| Example 1: Get the bit with a literal position |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[2],[3],[None]], ["value"]) |
| >>> df.select("*", sf.bit_get("value", sf.lit(1))).show() |
| +-----+-----------------+ |
| |value|bit_get(value, 1)| |
| +-----+-----------------+ |
| | 1| 0| |
| | 2| 1| |
| | 3| 1| |
| | NULL| NULL| |
| +-----+-----------------+ |
| |
| Example 2: Get the bit with a column position |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1,2],[2,1],[3,None],[None,1]], ["value", "pos"]) |
| >>> df.select("*", sf.bit_get(df.value, "pos")).show() |
| +-----+----+-------------------+ |
| |value| pos|bit_get(value, pos)| |
| +-----+----+-------------------+ |
| | 1| 2| 0| |
| | 2| 1| 1| |
| | 3|NULL| NULL| |
| | NULL| 1| NULL| |
| +-----+----+-------------------+ |
| """ |
| return _invoke_function_over_columns("bit_get", col, pos) |
| |
| |
| @_try_remote_functions |
| def getbit(col: "ColumnOrName", pos: "ColumnOrName") -> Column: |
| """ |
| Returns the value of the bit (0 or 1) at the specified position. |
| The positions are numbered from right to left, starting at zero. |
| The position argument cannot be negative. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| pos : :class:`~pyspark.sql.Column` or column name |
| The positions are numbered from right to left, starting at zero. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the value of the bit (0 or 1) at the specified position. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bit_get` |
| |
| Examples |
| -------- |
| Example 1: Get the bit with a literal position |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [[1], [2], [3], [None]], ["value"] |
| ... ).select("*", sf.getbit("value", sf.lit(1))).show() |
| +-----+----------------+ |
| |value|getbit(value, 1)| |
| +-----+----------------+ |
| | 1| 0| |
| | 2| 1| |
| | 3| 1| |
| | NULL| NULL| |
| +-----+----------------+ |
| |
| Example 2: Get the bit with a column position |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1,2],[2,1],[3,None],[None,1]], ["value", "pos"]) |
| >>> df.select("*", sf.getbit(df.value, "pos")).show() |
| +-----+----+------------------+ |
| |value| pos|getbit(value, pos)| |
| +-----+----+------------------+ |
| | 1| 2| 0| |
| | 2| 1| 1| |
| | 3|NULL| NULL| |
| | NULL| 1| NULL| |
| +-----+----+------------------+ |
| """ |
| return _invoke_function_over_columns("getbit", col, pos) |
| |
| |
| @_try_remote_functions |
| def asc_nulls_first(col: "ColumnOrName") -> Column: |
| """ |
| Sort Function: Returns a sort expression based on the ascending order of the given |
    column name, and null values appear before non-null values.
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to sort by in the ascending order. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column specifying the order. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.asc` |
| :meth:`pyspark.sql.functions.asc_nulls_last` |
| |
| Examples |
| -------- |
| Example 1: Sorting a DataFrame with null values in ascending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, "Bob"), (0, None), (2, "Alice")], ["age", "name"]) |
| >>> df.sort(sf.asc_nulls_first(df.name)).show() |
| +---+-----+ |
| |age| name| |
| +---+-----+ |
| | 0| NULL| |
| | 2|Alice| |
| | 1| Bob| |
| +---+-----+ |
| |
| Example 2: Sorting a DataFrame with multiple columns, null values in ascending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(1, "Bob", None), (0, None, "Z"), (2, "Alice", "Y")], ["age", "name", "grade"]) |
| >>> df.sort(sf.asc_nulls_first(df.name), sf.asc_nulls_first(df.grade)).show() |
| +---+-----+-----+ |
| |age| name|grade| |
| +---+-----+-----+ |
| | 0| NULL| Z| |
| | 2|Alice| Y| |
| | 1| Bob| NULL| |
| +---+-----+-----+ |
| |
| Example 3: Sorting a DataFrame with null values in ascending order using column name string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, "Bob"), (0, None), (2, "Alice")], ["age", "name"]) |
| >>> df.sort(sf.asc_nulls_first("name")).show() |
| +---+-----+ |
| |age| name| |
| +---+-----+ |
| | 0| NULL| |
| | 2|Alice| |
| | 1| Bob| |
| +---+-----+ |
| """ |
| return ( |
| col.asc_nulls_first() |
| if isinstance(col, Column) |
| else _invoke_function("asc_nulls_first", col) |
| ) |
| |
| |
| @_try_remote_functions |
| def asc_nulls_last(col: "ColumnOrName") -> Column: |
| """ |
| Sort Function: Returns a sort expression based on the ascending order of the given |
| column name, and null values appear after non-null values. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to sort by in the ascending order. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column specifying the order. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.asc` |
| :meth:`pyspark.sql.functions.asc_nulls_first` |
| |
| Examples |
| -------- |
| Example 1: Sorting a DataFrame with null values in ascending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(0, None), (1, "Bob"), (2, "Alice")], ["age", "name"]) |
| >>> df.sort(sf.asc_nulls_last(df.name)).show() |
| +---+-----+ |
| |age| name| |
| +---+-----+ |
| | 2|Alice| |
| | 1| Bob| |
| | 0| NULL| |
| +---+-----+ |
| |
| Example 2: Sorting a DataFrame with multiple columns, null values in ascending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(0, None, "Z"), (1, "Bob", None), (2, "Alice", "Y")], ["age", "name", "grade"]) |
| >>> df.sort(sf.asc_nulls_last(df.name), sf.asc_nulls_last(df.grade)).show() |
| +---+-----+-----+ |
| |age| name|grade| |
| +---+-----+-----+ |
| | 2|Alice| Y| |
| | 1| Bob| NULL| |
| | 0| NULL| Z| |
| +---+-----+-----+ |
| |
| Example 3: Sorting a DataFrame with null values in ascending order using column name string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(0, None), (1, "Bob"), (2, "Alice")], ["age", "name"]) |
| >>> df.sort(sf.asc_nulls_last("name")).show() |
| +---+-----+ |
| |age| name| |
| +---+-----+ |
| | 2|Alice| |
| | 1| Bob| |
| | 0| NULL| |
| +---+-----+ |
| """ |
| return ( |
| col.asc_nulls_last() if isinstance(col, Column) else _invoke_function("asc_nulls_last", col) |
| ) |
| |
| |
| @_try_remote_functions |
| def desc_nulls_first(col: "ColumnOrName") -> Column: |
| """ |
| Sort Function: Returns a sort expression based on the descending order of the given |
| column name, and null values appear before non-null values. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to sort by in the descending order. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column specifying the order. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.desc` |
| :meth:`pyspark.sql.functions.desc_nulls_last` |
| |
| Examples |
| -------- |
| Example 1: Sorting a DataFrame with null values in descending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, "Bob"), (0, None), (2, "Alice")], ["age", "name"]) |
| >>> df.sort(sf.desc_nulls_first(df.name)).show() |
| +---+-----+ |
| |age| name| |
| +---+-----+ |
| | 0| NULL| |
| | 1| Bob| |
| | 2|Alice| |
| +---+-----+ |
| |
| Example 2: Sorting a DataFrame with multiple columns, null values in descending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(1, "Bob", None), (0, None, "Z"), (2, "Alice", "Y")], ["age", "name", "grade"]) |
| >>> df.sort(sf.desc_nulls_first(df.name), sf.desc_nulls_first(df.grade)).show() |
| +---+-----+-----+ |
| |age| name|grade| |
| +---+-----+-----+ |
| | 0| NULL| Z| |
| | 1| Bob| NULL| |
| | 2|Alice| Y| |
| +---+-----+-----+ |
| |
| Example 3: Sorting a DataFrame with null values in descending order using column name string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, "Bob"), (0, None), (2, "Alice")], ["age", "name"]) |
| >>> df.sort(sf.desc_nulls_first("name")).show() |
| +---+-----+ |
| |age| name| |
| +---+-----+ |
| | 0| NULL| |
| | 1| Bob| |
| | 2|Alice| |
| +---+-----+ |
| """ |
| return ( |
| col.desc_nulls_first() |
| if isinstance(col, Column) |
| else _invoke_function("desc_nulls_first", col) |
| ) |
| |
| |
| @_try_remote_functions |
| def desc_nulls_last(col: "ColumnOrName") -> Column: |
| """ |
| Sort Function: Returns a sort expression based on the descending order of the given |
| column name, and null values appear after non-null values. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to sort by in the descending order. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column specifying the order. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.desc` |
| :meth:`pyspark.sql.functions.desc_nulls_first` |
| |
| Examples |
| -------- |
| Example 1: Sorting a DataFrame with null values in descending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(0, None), (1, "Bob"), (2, "Alice")], ["age", "name"]) |
| >>> df.sort(sf.desc_nulls_last(df.name)).show() |
| +---+-----+ |
| |age| name| |
| +---+-----+ |
| | 1| Bob| |
| | 2|Alice| |
| | 0| NULL| |
| +---+-----+ |
| |
| Example 2: Sorting a DataFrame with multiple columns, null values in descending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(0, None, "Z"), (1, "Bob", None), (2, "Alice", "Y")], ["age", "name", "grade"]) |
| >>> df.sort(sf.desc_nulls_last(df.name), sf.desc_nulls_last(df.grade)).show() |
| +---+-----+-----+ |
| |age| name|grade| |
| +---+-----+-----+ |
| | 1| Bob| NULL| |
| | 2|Alice| Y| |
| | 0| NULL| Z| |
| +---+-----+-----+ |
| |
| Example 3: Sorting a DataFrame with null values in descending order using column name string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(0, None), (1, "Bob"), (2, "Alice")], ["age", "name"]) |
| >>> df.sort(sf.desc_nulls_last("name")).show() |
| +---+-----+ |
| |age| name| |
| +---+-----+ |
| | 1| Bob| |
| | 2|Alice| |
| | 0| NULL| |
| +---+-----+ |
| """ |
| return ( |
| col.desc_nulls_last() |
| if isinstance(col, Column) |
| else _invoke_function("desc_nulls_last", col) |
| ) |
| |
| |
| @_try_remote_functions |
| def stddev(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: alias for stddev_samp. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.std` |
| :meth:`pyspark.sql.functions.stddev_pop` |
| :meth:`pyspark.sql.functions.stddev_samp` |
| :meth:`pyspark.sql.functions.variance` |
| :meth:`pyspark.sql.functions.skewness` |
| :meth:`pyspark.sql.functions.kurtosis` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| standard deviation of given column. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(6).select(sf.stddev("id")).show() |
| +------------------+ |
| | stddev(id)| |
| +------------------+ |
| |1.8708286933869...| |
| +------------------+ |
| """ |
| return _invoke_function_over_columns("stddev", col) |
| |
| |
| @_try_remote_functions |
| def std(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: alias for stddev_samp. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| standard deviation of given column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.stddev` |
| :meth:`pyspark.sql.functions.stddev_pop` |
| :meth:`pyspark.sql.functions.stddev_samp` |
| :meth:`pyspark.sql.functions.variance` |
| :meth:`pyspark.sql.functions.skewness` |
| :meth:`pyspark.sql.functions.kurtosis` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(6).select(sf.std("id")).show() |
| +------------------+ |
| | std(id)| |
| +------------------+ |
| |1.8708286933869...| |
| +------------------+ |
| """ |
| return _invoke_function_over_columns("std", col) |
| |
| |
| @_try_remote_functions |
| def stddev_samp(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the unbiased sample standard deviation of |
| the expression in a group. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| standard deviation of given column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.std` |
| :meth:`pyspark.sql.functions.stddev` |
| :meth:`pyspark.sql.functions.stddev_pop` |
| :meth:`pyspark.sql.functions.var_samp` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(6).select(sf.stddev_samp("id")).show() |
| +------------------+ |
| | stddev_samp(id)| |
| +------------------+ |
| |1.8708286933869...| |
| +------------------+ |
| """ |
| return _invoke_function_over_columns("stddev_samp", col) |
| |
| |
| @_try_remote_functions |
| def stddev_pop(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns population standard deviation of |
| the expression in a group. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| standard deviation of given column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.std` |
| :meth:`pyspark.sql.functions.stddev` |
| :meth:`pyspark.sql.functions.stddev_samp` |
| :meth:`pyspark.sql.functions.var_pop` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(6).select(sf.stddev_pop("id")).show() |
| +-----------------+ |
| | stddev_pop(id)| |
| +-----------------+ |
| |1.707825127659...| |
| +-----------------+ |
| """ |
| return _invoke_function_over_columns("stddev_pop", col) |
| |
| |
| @_try_remote_functions |
| def variance(col: "ColumnOrName") -> Column: |
| """ |
    Aggregate function: alias for var_samp.
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| variance of given column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.var_pop` |
| :meth:`pyspark.sql.functions.var_samp` |
| :meth:`pyspark.sql.functions.stddev` |
| :meth:`pyspark.sql.functions.skewness` |
| :meth:`pyspark.sql.functions.kurtosis` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(6) |
| >>> df.select(sf.variance(df.id)).show() |
| +------------+ |
| |variance(id)| |
| +------------+ |
| | 3.5| |
| +------------+ |
| """ |
| return _invoke_function_over_columns("variance", col) |
| |
| |
| @_try_remote_functions |
| def var_samp(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the unbiased sample variance of |
| the values in a group. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| variance of given column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.variance` |
| :meth:`pyspark.sql.functions.var_pop` |
    :meth:`pyspark.sql.functions.stddev_samp`
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(6) |
| >>> df.select(sf.var_samp(df.id)).show() |
| +------------+ |
| |var_samp(id)| |
| +------------+ |
| | 3.5| |
| +------------+ |
| """ |
| return _invoke_function_over_columns("var_samp", col) |
| |
| |
| @_try_remote_functions |
| def var_pop(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the population variance of the values in a group. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| variance of given column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.variance` |
| :meth:`pyspark.sql.functions.var_samp` |
    :meth:`pyspark.sql.functions.stddev_pop`
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(6) |
| >>> df.select(sf.var_pop(df.id)).show() |
| +------------------+ |
| | var_pop(id)| |
| +------------------+ |
| |2.9166666666666...| |
| +------------------+ |
| """ |
| return _invoke_function_over_columns("var_pop", col) |
| |
| |
| @_try_remote_functions |
| def regr_avgx(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the average of the independent variable for non-null pairs |
| in a group, where `y` is the dependent variable and `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the average of the independent variable for non-null pairs in a group. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgy` |
| :meth:`pyspark.sql.functions.regr_count` |
| :meth:`pyspark.sql.functions.regr_intercept` |
| :meth:`pyspark.sql.functions.regr_r2` |
| :meth:`pyspark.sql.functions.regr_slope` |
| :meth:`pyspark.sql.functions.regr_sxy` |
| :meth:`pyspark.sql.functions.regr_syy` |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show() |
| +---------------+------+ |
| |regr_avgx(y, x)|avg(x)| |
| +---------------+------+ |
| | 2.75| 2.75| |
| +---------------+------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show() |
| +---------------+------+ |
| |regr_avgx(y, x)|avg(x)| |
| +---------------+------+ |
| | NULL| NULL| |
| +---------------+------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show() |
| +---------------+------+ |
| |regr_avgx(y, x)|avg(x)| |
| +---------------+------+ |
| | NULL| 1.0| |
| +---------------+------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show() |
| +---------------+------+ |
| |regr_avgx(y, x)|avg(x)| |
| +---------------+------+ |
| | 3.0| 3.0| |
| +---------------+------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show() |
| +---------------+------+ |
| |regr_avgx(y, x)|avg(x)| |
| +---------------+------+ |
| | 3.0| 3.0| |
| +---------------+------+ |
| """ |
| return _invoke_function_over_columns("regr_avgx", y, x) |
| |
| |
| @_try_remote_functions |
| def regr_avgy(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the average of the dependent variable for non-null pairs |
| in a group, where `y` is the dependent variable and `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the average of the dependent variable for non-null pairs in a group. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgx` |
| :meth:`pyspark.sql.functions.regr_count` |
| :meth:`pyspark.sql.functions.regr_intercept` |
| :meth:`pyspark.sql.functions.regr_r2` |
| :meth:`pyspark.sql.functions.regr_slope` |
| :meth:`pyspark.sql.functions.regr_sxy` |
| :meth:`pyspark.sql.functions.regr_syy` |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show() |
| +---------------+------+ |
| |regr_avgy(y, x)|avg(y)| |
| +---------------+------+ |
| | 1.75| 1.75| |
| +---------------+------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show() |
| +---------------+------+ |
| |regr_avgy(y, x)|avg(y)| |
| +---------------+------+ |
| | NULL| 1.0| |
| +---------------+------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show() |
| +---------------+------+ |
| |regr_avgy(y, x)|avg(y)| |
| +---------------+------+ |
| | NULL| NULL| |
| +---------------+------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show() |
| +------------------+------+ |
| | regr_avgy(y, x)|avg(y)| |
| +------------------+------+ |
| |1.6666666666666...| 1.75| |
| +------------------+------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show() |
| +---------------+------------------+ |
| |regr_avgy(y, x)| avg(y)| |
| +---------------+------------------+ |
| | 1.5|1.6666666666666...| |
| +---------------+------------------+ |
| """ |
| return _invoke_function_over_columns("regr_avgy", y, x) |
| |
| |
| @_try_remote_functions |
| def regr_count(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the number of non-null number pairs |
| in a group, where `y` is the dependent variable and `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgx` |
| :meth:`pyspark.sql.functions.regr_avgy` |
| :meth:`pyspark.sql.functions.regr_intercept` |
| :meth:`pyspark.sql.functions.regr_r2` |
| :meth:`pyspark.sql.functions.regr_slope` |
| :meth:`pyspark.sql.functions.regr_sxy` |
| :meth:`pyspark.sql.functions.regr_syy` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the number of non-null number pairs in a group. |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show() |
| +----------------+--------+ |
| |regr_count(y, x)|count(0)| |
| +----------------+--------+ |
| | 4| 4| |
| +----------------+--------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show() |
| +----------------+--------+ |
| |regr_count(y, x)|count(0)| |
| +----------------+--------+ |
| | 0| 1| |
| +----------------+--------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show() |
| +----------------+--------+ |
| |regr_count(y, x)|count(0)| |
| +----------------+--------+ |
| | 0| 1| |
| +----------------+--------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show() |
| +----------------+--------+ |
| |regr_count(y, x)|count(0)| |
| +----------------+--------+ |
| | 3| 4| |
| +----------------+--------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show() |
| +----------------+--------+ |
| |regr_count(y, x)|count(0)| |
| +----------------+--------+ |
| | 2| 4| |
| +----------------+--------+ |
| """ |
| return _invoke_function_over_columns("regr_count", y, x) |
| |
| |
| @_try_remote_functions |
| def regr_intercept(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the intercept of the univariate linear regression line |
| for non-null pairs in a group, where `y` is the dependent variable and |
| `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the intercept of the univariate linear regression line for non-null pairs in a group. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgx` |
| :meth:`pyspark.sql.functions.regr_avgy` |
| :meth:`pyspark.sql.functions.regr_count` |
| :meth:`pyspark.sql.functions.regr_r2` |
| :meth:`pyspark.sql.functions.regr_slope` |
| :meth:`pyspark.sql.functions.regr_sxy` |
| :meth:`pyspark.sql.functions.regr_syy` |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_intercept("y", "x")).show() |
| +--------------------+ |
| |regr_intercept(y, x)| |
| +--------------------+ |
| | 0.0| |
| +--------------------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_intercept("y", "x")).show() |
| +--------------------+ |
| |regr_intercept(y, x)| |
| +--------------------+ |
| | NULL| |
| +--------------------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_intercept("y", "x")).show() |
| +--------------------+ |
| |regr_intercept(y, x)| |
| +--------------------+ |
| | NULL| |
| +--------------------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_intercept("y", "x")).show() |
| +--------------------+ |
| |regr_intercept(y, x)| |
| +--------------------+ |
| | 0.0| |
| +--------------------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_intercept("y", "x")).show() |
| +--------------------+ |
| |regr_intercept(y, x)| |
| +--------------------+ |
| | 0.0| |
| +--------------------+ |
| """ |
| return _invoke_function_over_columns("regr_intercept", y, x) |
| |
| |
| @_try_remote_functions |
| def regr_r2(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the coefficient of determination for non-null pairs |
| in a group, where `y` is the dependent variable and `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the coefficient of determination for non-null pairs in a group. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgx` |
| :meth:`pyspark.sql.functions.regr_avgy` |
| :meth:`pyspark.sql.functions.regr_count` |
| :meth:`pyspark.sql.functions.regr_intercept` |
| :meth:`pyspark.sql.functions.regr_slope` |
| :meth:`pyspark.sql.functions.regr_sxy` |
| :meth:`pyspark.sql.functions.regr_syy` |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_r2("y", "x")).show() |
| +-------------+ |
| |regr_r2(y, x)| |
| +-------------+ |
| | 1.0| |
| +-------------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_r2("y", "x")).show() |
| +-------------+ |
| |regr_r2(y, x)| |
| +-------------+ |
| | NULL| |
| +-------------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_r2("y", "x")).show() |
| +-------------+ |
| |regr_r2(y, x)| |
| +-------------+ |
| | NULL| |
| +-------------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_r2("y", "x")).show() |
| +-------------+ |
| |regr_r2(y, x)| |
| +-------------+ |
| | 1.0| |
| +-------------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_r2("y", "x")).show() |
| +-------------+ |
| |regr_r2(y, x)| |
| +-------------+ |
| | 1.0| |
| +-------------+ |
| """ |
| return _invoke_function_over_columns("regr_r2", y, x) |
| |
| |
| @_try_remote_functions |
| def regr_slope(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the slope of the linear regression line for non-null pairs |
| in a group, where `y` is the dependent variable and `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the slope of the linear regression line for non-null pairs in a group. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgx` |
| :meth:`pyspark.sql.functions.regr_avgy` |
| :meth:`pyspark.sql.functions.regr_count` |
| :meth:`pyspark.sql.functions.regr_intercept` |
| :meth:`pyspark.sql.functions.regr_r2` |
| :meth:`pyspark.sql.functions.regr_sxy` |
| :meth:`pyspark.sql.functions.regr_syy` |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_slope("y", "x")).show() |
| +----------------+ |
| |regr_slope(y, x)| |
| +----------------+ |
| | 1.0| |
| +----------------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_slope("y", "x")).show() |
| +----------------+ |
| |regr_slope(y, x)| |
| +----------------+ |
| | NULL| |
| +----------------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_slope("y", "x")).show() |
| +----------------+ |
| |regr_slope(y, x)| |
| +----------------+ |
| | NULL| |
| +----------------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_slope("y", "x")).show() |
| +----------------+ |
| |regr_slope(y, x)| |
| +----------------+ |
| | 1.0| |
| +----------------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_slope("y", "x")).show() |
| +----------------+ |
| |regr_slope(y, x)| |
| +----------------+ |
| | 1.0| |
| +----------------+ |
| """ |
| return _invoke_function_over_columns("regr_slope", y, x) |
| |
| |
| @_try_remote_functions |
| def regr_sxx(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs |
| in a group, where `y` is the dependent variable and `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs in a group. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgx` |
| :meth:`pyspark.sql.functions.regr_avgy` |
| :meth:`pyspark.sql.functions.regr_count` |
| :meth:`pyspark.sql.functions.regr_intercept` |
| :meth:`pyspark.sql.functions.regr_r2` |
| :meth:`pyspark.sql.functions.regr_sxy` |
| :meth:`pyspark.sql.functions.regr_syy` |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_sxx("y", "x")).show() |
| +--------------+ |
| |regr_sxx(y, x)| |
| +--------------+ |
| | 5.0| |
| +--------------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_sxx("y", "x")).show() |
| +--------------+ |
| |regr_sxx(y, x)| |
| +--------------+ |
| | NULL| |
| +--------------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_sxx("y", "x")).show() |
| +--------------+ |
| |regr_sxx(y, x)| |
| +--------------+ |
| | NULL| |
| +--------------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_sxx("y", "x")).show() |
| +-----------------+ |
| | regr_sxx(y, x)| |
| +-----------------+ |
| |4.666666666666...| |
| +-----------------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_sxx("y", "x")).show() |
| +--------------+ |
| |regr_sxx(y, x)| |
| +--------------+ |
| | 4.5| |
| +--------------+ |
| """ |
| return _invoke_function_over_columns("regr_sxx", y, x) |
| |
| |
| @_try_remote_functions |
| def regr_sxy(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs |
| in a group, where `y` is the dependent variable and `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgx` |
| :meth:`pyspark.sql.functions.regr_avgy` |
| :meth:`pyspark.sql.functions.regr_count` |
| :meth:`pyspark.sql.functions.regr_intercept` |
| :meth:`pyspark.sql.functions.regr_r2` |
| :meth:`pyspark.sql.functions.regr_slope` |
| :meth:`pyspark.sql.functions.regr_syy` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs in a group. |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_sxy("y", "x")).show() |
| +--------------+ |
| |regr_sxy(y, x)| |
| +--------------+ |
| | 5.0| |
| +--------------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_sxy("y", "x")).show() |
| +--------------+ |
| |regr_sxy(y, x)| |
| +--------------+ |
| | NULL| |
| +--------------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_sxy("y", "x")).show() |
| +--------------+ |
| |regr_sxy(y, x)| |
| +--------------+ |
| | NULL| |
| +--------------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_sxy("y", "x")).show() |
| +-----------------+ |
| | regr_sxy(y, x)| |
| +-----------------+ |
| |4.666666666666...| |
| +-----------------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_sxy("y", "x")).show() |
| +--------------+ |
| |regr_sxy(y, x)| |
| +--------------+ |
| | 4.5| |
| +--------------+ |
| """ |
| return _invoke_function_over_columns("regr_sxy", y, x) |
| |
| |
| @_try_remote_functions |
| def regr_syy(y: "ColumnOrName", x: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs |
| in a group, where `y` is the dependent variable and `x` is the independent variable. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| y : :class:`~pyspark.sql.Column` or column name |
| the dependent variable. |
| x : :class:`~pyspark.sql.Column` or column name |
| the independent variable. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs in a group. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regr_avgx` |
| :meth:`pyspark.sql.functions.regr_avgy` |
| :meth:`pyspark.sql.functions.regr_count` |
| :meth:`pyspark.sql.functions.regr_intercept` |
| :meth:`pyspark.sql.functions.regr_r2` |
| :meth:`pyspark.sql.functions.regr_slope` |
| :meth:`pyspark.sql.functions.regr_sxy` |
| |
| Examples |
| -------- |
| Example 1: All pairs are non-null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_syy("y", "x")).show() |
| +--------------+ |
| |regr_syy(y, x)| |
| +--------------+ |
| | 5.0| |
| +--------------+ |
| |
| Example 2: All pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)") |
| >>> df.select(sf.regr_syy("y", "x")).show() |
| +--------------+ |
| |regr_syy(y, x)| |
| +--------------+ |
| | NULL| |
| +--------------+ |
| |
| Example 3: All pairs' y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)") |
| >>> df.select(sf.regr_syy("y", "x")).show() |
| +--------------+ |
| |regr_syy(y, x)| |
| +--------------+ |
| | NULL| |
| +--------------+ |
| |
| Example 4: Some pairs' x values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_syy("y", "x")).show() |
| +-----------------+ |
| | regr_syy(y, x)| |
| +-----------------+ |
| |4.666666666666...| |
| +-----------------+ |
| |
| Example 5: Some pairs' x or y values are null |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)") |
| >>> df.select(sf.regr_syy("y", "x")).show() |
| +--------------+ |
| |regr_syy(y, x)| |
| +--------------+ |
| | 4.5| |
| +--------------+ |
| """ |
| return _invoke_function_over_columns("regr_syy", y, x) |
| |
| |
| @_try_remote_functions |
| def every(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns true if all values of `col` are true. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to check if all values are true. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.some` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| true if all values of `col` are true, false otherwise. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [[True], [True], [True]], ["flag"] |
| ... ).select(sf.every("flag")).show() |
| +-----------+ |
| |every(flag)| |
| +-----------+ |
| | true| |
| +-----------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [[True], [False], [True]], ["flag"] |
| ... ).select(sf.every("flag")).show() |
| +-----------+ |
| |every(flag)| |
| +-----------+ |
| | false| |
| +-----------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [[False], [False], [False]], ["flag"] |
| ... ).select(sf.every("flag")).show() |
| +-----------+ |
| |every(flag)| |
| +-----------+ |
| | false| |
| +-----------+ |
| """ |
| return _invoke_function_over_columns("every", col) |
| |
| |
| @_try_remote_functions |
| def bool_and(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns true if all values of `col` are true. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to check if all values are true. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| true if all values of `col` are true, false otherwise. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bool_or` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[True], [True], [True]], ["flag"]) |
| >>> df.select(sf.bool_and("flag")).show() |
| +--------------+ |
| |bool_and(flag)| |
| +--------------+ |
| | true| |
| +--------------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[True], [False], [True]], ["flag"]) |
| >>> df.select(sf.bool_and("flag")).show() |
| +--------------+ |
| |bool_and(flag)| |
| +--------------+ |
| | false| |
| +--------------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[False], [False], [False]], ["flag"]) |
| >>> df.select(sf.bool_and("flag")).show() |
| +--------------+ |
| |bool_and(flag)| |
| +--------------+ |
| | false| |
| +--------------+ |
| """ |
| return _invoke_function_over_columns("bool_and", col) |
| |
| |
| @_try_remote_functions |
| def some(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns true if at least one value of `col` is true. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to check if at least one value is true. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| true if at least one value of `col` is true, false otherwise. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.every` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [[True], [True], [True]], ["flag"] |
| ... ).select(sf.some("flag")).show() |
| +----------+ |
| |some(flag)| |
| +----------+ |
| | true| |
| +----------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [[True], [False], [True]], ["flag"] |
| ... ).select(sf.some("flag")).show() |
| +----------+ |
| |some(flag)| |
| +----------+ |
| | true| |
| +----------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [[False], [False], [False]], ["flag"] |
| ... ).select(sf.some("flag")).show() |
| +----------+ |
| |some(flag)| |
| +----------+ |
| | false| |
| +----------+ |
| """ |
| return _invoke_function_over_columns("some", col) |
| |
| |
| @_try_remote_functions |
| def bool_or(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns true if at least one value of `col` is true. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to check if at least one value is true. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| true if at least one value of `col` is true, false otherwise. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bool_and` |
| |
| Examples |
| -------- |
    >>> import pyspark.sql.functions as sf
    >>> df = spark.createDataFrame([[True], [True], [True]], ["flag"])
    >>> df.select(sf.bool_or("flag")).show()
    +-------------+
    |bool_or(flag)|
    +-------------+
    |         true|
    +-------------+

    >>> import pyspark.sql.functions as sf
    >>> df = spark.createDataFrame([[True], [False], [True]], ["flag"])
    >>> df.select(sf.bool_or("flag")).show()
    +-------------+
    |bool_or(flag)|
    +-------------+
    |         true|
    +-------------+

    >>> import pyspark.sql.functions as sf
    >>> df = spark.createDataFrame([[False], [False], [False]], ["flag"])
    >>> df.select(sf.bool_or("flag")).show()
    +-------------+
    |bool_or(flag)|
    +-------------+
    |        false|
    +-------------+
| """ |
| return _invoke_function_over_columns("bool_or", col) |
| |
| |
| @_try_remote_functions |
| def bit_and(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the bitwise AND of all non-null input values, or null if none. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the bitwise AND of all non-null input values, or null if none. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bit_or` |
| :meth:`pyspark.sql.functions.bit_xor` |
| |
| Examples |
| -------- |
| Example 1: Bitwise AND with all non-null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[1],[2]], ["c"]) |
| >>> df.select(sf.bit_and("c")).show() |
| +----------+ |
| |bit_and(c)| |
| +----------+ |
| | 0| |
| +----------+ |
| |
    Example 2: Bitwise AND with some null values
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[None],[2]], ["c"]) |
| >>> df.select(sf.bit_and("c")).show() |
| +----------+ |
| |bit_and(c)| |
| +----------+ |
| | 0| |
| +----------+ |
| |
| Example 3: Bitwise AND with all null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import IntegerType, StructType, StructField |
| >>> schema = StructType([StructField("c", IntegerType(), True)]) |
| >>> df = spark.createDataFrame([[None],[None],[None]], schema=schema) |
| >>> df.select(sf.bit_and("c")).show() |
| +----------+ |
| |bit_and(c)| |
| +----------+ |
| | NULL| |
| +----------+ |
| |
    Example 4: Bitwise AND with a single input value
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[5]], ["c"]) |
| >>> df.select(sf.bit_and("c")).show() |
| +----------+ |
| |bit_and(c)| |
| +----------+ |
| | 5| |
| +----------+ |
| """ |
| return _invoke_function_over_columns("bit_and", col) |
| |
| |
| @_try_remote_functions |
| def bit_or(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the bitwise OR of all non-null input values, or null if none. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the bitwise OR of all non-null input values, or null if none. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bit_and` |
| :meth:`pyspark.sql.functions.bit_xor` |
| |
| Examples |
| -------- |
| Example 1: Bitwise OR with all non-null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[1],[2]], ["c"]) |
| >>> df.select(sf.bit_or("c")).show() |
| +---------+ |
| |bit_or(c)| |
| +---------+ |
| | 3| |
| +---------+ |
| |
| Example 2: Bitwise OR with some null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[None],[2]], ["c"]) |
| >>> df.select(sf.bit_or("c")).show() |
| +---------+ |
| |bit_or(c)| |
| +---------+ |
| | 3| |
| +---------+ |
| |
| Example 3: Bitwise OR with all null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import IntegerType, StructType, StructField |
| >>> schema = StructType([StructField("c", IntegerType(), True)]) |
| >>> df = spark.createDataFrame([[None],[None],[None]], schema=schema) |
| >>> df.select(sf.bit_or("c")).show() |
| +---------+ |
| |bit_or(c)| |
| +---------+ |
| | NULL| |
| +---------+ |
| |
    Example 4: Bitwise OR with a single input value
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[5]], ["c"]) |
| >>> df.select(sf.bit_or("c")).show() |
| +---------+ |
| |bit_or(c)| |
| +---------+ |
| | 5| |
| +---------+ |
| """ |
| return _invoke_function_over_columns("bit_or", col) |
| |
| |
| @_try_remote_functions |
| def bit_xor(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the bitwise XOR of all non-null input values, or null if none. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the bitwise XOR of all non-null input values, or null if none. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bit_and` |
| :meth:`pyspark.sql.functions.bit_or` |
| |
| Examples |
| -------- |
| Example 1: Bitwise XOR with all non-null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[1],[2]], ["c"]) |
| >>> df.select(sf.bit_xor("c")).show() |
| +----------+ |
| |bit_xor(c)| |
| +----------+ |
| | 2| |
| +----------+ |
| |
| Example 2: Bitwise XOR with some null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[None],[2]], ["c"]) |
| >>> df.select(sf.bit_xor("c")).show() |
| +----------+ |
| |bit_xor(c)| |
| +----------+ |
| | 3| |
| +----------+ |
| |
| Example 3: Bitwise XOR with all null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import IntegerType, StructType, StructField |
| >>> schema = StructType([StructField("c", IntegerType(), True)]) |
| >>> df = spark.createDataFrame([[None],[None],[None]], schema=schema) |
| >>> df.select(sf.bit_xor("c")).show() |
| +----------+ |
| |bit_xor(c)| |
| +----------+ |
| | NULL| |
| +----------+ |
| |
    Example 4: Bitwise XOR with a single input value
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[5]], ["c"]) |
| >>> df.select(sf.bit_xor("c")).show() |
| +----------+ |
| |bit_xor(c)| |
| +----------+ |
| | 5| |
| +----------+ |
| """ |
| return _invoke_function_over_columns("bit_xor", col) |
| |
| |
| @_try_remote_functions |
| def skewness(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the skewness of the values in a group. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.std` |
| :meth:`pyspark.sql.functions.stddev` |
| :meth:`pyspark.sql.functions.variance` |
| :meth:`pyspark.sql.functions.kurtosis` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| skewness of given column. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[1],[2]], ["c"]) |
| >>> df.select(sf.skewness(df.c)).show() |
| +------------------+ |
| | skewness(c)| |
| +------------------+ |
| |0.7071067811865...| |
| +------------------+ |
| """ |
| return _invoke_function_over_columns("skewness", col) |
| |
| |
| @_try_remote_functions |
| def kurtosis(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the kurtosis of the values in a group. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| kurtosis of given column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.std` |
| :meth:`pyspark.sql.functions.stddev` |
| :meth:`pyspark.sql.functions.variance` |
| :meth:`pyspark.sql.functions.skewness` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[1],[2]], ["c"]) |
| >>> df.select(sf.kurtosis(df.c)).show() |
| +-----------+ |
| |kurtosis(c)| |
| +-----------+ |
| | -1.5| |
| +-----------+ |
| """ |
| return _invoke_function_over_columns("kurtosis", col) |
| |
| |
| @_try_remote_functions |
| def collect_list(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: Collects the values from a column into a list, |
| maintaining duplicates, and returns this list of objects. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column on which the function is computed. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.array_agg` |
| :meth:`pyspark.sql.functions.collect_set` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new Column object representing a list of collected values, with duplicate values included. |
| |
| Notes |
| ----- |
    The function is non-deterministic because the order of collected results depends
    on the order of the rows, which may be non-deterministic after any shuffle operations.
| |
| Examples |
| -------- |
| Example 1: Collect values from a DataFrame and sort the result in ascending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',)) |
| >>> df.select(sf.sort_array(sf.collect_list('value')).alias('sorted_list')).show() |
| +-----------+ |
| |sorted_list| |
| +-----------+ |
| | [1, 2, 2]| |
| +-----------+ |
| |
| Example 2: Collect values from a DataFrame and sort the result in descending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) |
| >>> df.select(sf.sort_array(sf.collect_list('age'), asc=False).alias('sorted_list')).show() |
| +-----------+ |
| |sorted_list| |
| +-----------+ |
| | [5, 5, 2]| |
| +-----------+ |
| |
| Example 3: Collect values from a DataFrame with multiple columns and sort the result |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")], ("id", "name")) |
| >>> df = df.groupBy("name").agg(sf.sort_array(sf.collect_list('id')).alias('sorted_list')) |
| >>> df.orderBy(sf.desc("name")).show() |
| +----+-----------+ |
| |name|sorted_list| |
| +----+-----------+ |
| |John| [1, 2]| |
| | Ana| [3]| |
| +----+-----------+ |
| """ |
| return _invoke_function_over_columns("collect_list", col) |
| |
| |
| @_try_remote_functions |
| def array_agg(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns a list of objects with duplicates. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| list of objects with duplicates. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.collect_list` |
| :meth:`pyspark.sql.functions.collect_set` |
| |
| Examples |
| -------- |
| Example 1: Using array_agg function on an int column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[1],[2]], ["c"]) |
| >>> df.agg(sf.sort_array(sf.array_agg('c')).alias('sorted_list')).show() |
| +-----------+ |
| |sorted_list| |
| +-----------+ |
| | [1, 1, 2]| |
| +-----------+ |
| |
| Example 2: Using array_agg function on a string column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([["apple"],["apple"],["banana"]], ["c"]) |
| >>> df.agg(sf.sort_array(sf.array_agg('c')).alias('sorted_list')).show(truncate=False) |
| +----------------------+ |
| |sorted_list | |
| +----------------------+ |
| |[apple, apple, banana]| |
| +----------------------+ |
| |
| Example 3: Using array_agg function on a column with null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],[None],[2]], ["c"]) |
| >>> df.agg(sf.sort_array(sf.array_agg('c')).alias('sorted_list')).show() |
| +-----------+ |
| |sorted_list| |
| +-----------+ |
| | [1, 2]| |
| +-----------+ |
| |
| Example 4: Using array_agg function on a column with different data types |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([[1],["apple"],[2]], ["c"]) |
| >>> df.agg(sf.sort_array(sf.array_agg('c')).alias('sorted_list')).show() |
| +-------------+ |
| | sorted_list| |
| +-------------+ |
| |[1, 2, apple]| |
| +-------------+ |
| """ |
| return _invoke_function_over_columns("array_agg", col) |
| |
| |
| @_try_remote_functions |
| def collect_set(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: Collects the values from a column into a set, |
| eliminating duplicates, and returns this set of objects. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column on which the function is computed. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new Column object representing a set of collected values, duplicates excluded. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.array_agg` |
| :meth:`pyspark.sql.functions.collect_list` |
| |
| Notes |
| ----- |
| This function is non-deterministic as the order of collected results depends |
| on the order of the rows, which may be non-deterministic after any shuffle operations. |
| |
| Examples |
| -------- |
| Example 1: Collect values from a DataFrame and sort the result in ascending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',)) |
| >>> df.select(sf.sort_array(sf.collect_set('value')).alias('sorted_set')).show() |
| +----------+ |
| |sorted_set| |
| +----------+ |
| | [1, 2]| |
| +----------+ |
| |
| Example 2: Collect values from a DataFrame and sort the result in descending order |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',)) |
| >>> df.select(sf.sort_array(sf.collect_set('age'), asc=False).alias('sorted_set')).show() |
| +----------+ |
| |sorted_set| |
| +----------+ |
| | [5, 2]| |
| +----------+ |
| |
| Example 3: Collect values from a DataFrame with multiple columns and sort the result |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")], ("id", "name")) |
| >>> df = df.groupBy("name").agg(sf.sort_array(sf.collect_set('id')).alias('sorted_set')) |
| >>> df.orderBy(sf.desc("name")).show() |
| +----+----------+ |
| |name|sorted_set| |
| +----+----------+ |
| |John| [1, 2]| |
| | Ana| [3]| |
| +----+----------+ |
| """ |
| return _invoke_function_over_columns("collect_set", col) |
| |
| |
| @_try_remote_functions |
| def degrees(col: "ColumnOrName") -> Column: |
| """ |
| Converts an angle measured in radians to an approximately equivalent angle |
| measured in degrees. |
| |
| .. versionadded:: 2.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| angle in radians |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| angle in degrees, as if computed by `java.lang.Math.toDegrees()` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (0.0), (PI()), (PI() / 2), (PI() / 4) AS TAB(value)" |
| ... ).select("*", sf.degrees("value")).show() |
| +------------------+--------------+ |
| | value|DEGREES(value)| |
| +------------------+--------------+ |
| | 0.0| 0.0| |
| | 3.141592653589...| 180.0| |
| |1.5707963267948...| 90.0| |
| |0.7853981633974...| 45.0| |
| +------------------+--------------+ |
| """ |
| return _invoke_function_over_columns("degrees", col) |
| |
| |
| @_try_remote_functions |
| def radians(col: "ColumnOrName") -> Column: |
| """ |
| Converts an angle measured in degrees to an approximately equivalent angle |
| measured in radians. |
| |
| .. versionadded:: 2.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| angle in degrees |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| angle in radians, as if computed by `java.lang.Math.toRadians()` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.sql( |
| ... "SELECT * FROM VALUES (180), (90), (45), (0) AS TAB(value)" |
| ... ).select("*", sf.radians("value")).show() |
| +-----+------------------+ |
| |value| RADIANS(value)| |
| +-----+------------------+ |
| | 180| 3.141592653589...| |
| | 90|1.5707963267948...| |
| | 45|0.7853981633974...| |
| | 0| 0.0| |
| +-----+------------------+ |
| """ |
| return _invoke_function_over_columns("radians", col) |
| |
| |
| @_try_remote_functions |
| def atan2(col1: Union["ColumnOrName", float], col2: Union["ColumnOrName", float]) -> Column: |
| """ |
Computes the angle in radians between the positive x-axis of a plane
and the point given by the coordinates (`x`, `y`).
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column`, column name or float |
| coordinate on y-axis |
| col2 : :class:`~pyspark.sql.Column`, column name or float |
| coordinate on x-axis |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the `theta` component of the point |
| (`r`, `theta`) |
| in polar coordinates that corresponds to the point |
| (`x`, `y`) in Cartesian coordinates, |
| as if computed by `java.lang.Math.atan2()` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.atan2(sf.lit(1), sf.lit(2))).show() |
| +------------------+ |
| | ATAN2(1, 2)| |
| +------------------+ |
| |0.4636476090008...| |
| +------------------+ |
| """ |
| return _invoke_binary_math_function("atan2", col1, col2) |
| |
| |
| @_try_remote_functions |
| def hypot(col1: Union["ColumnOrName", float], col2: Union["ColumnOrName", float]) -> Column: |
| """ |
| Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column`, column name or float |
| a leg. |
| col2 : :class:`~pyspark.sql.Column`, column name or float |
| b leg. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| length of the hypotenuse. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.hypot(sf.lit(1), sf.lit(2))).show() |
| +----------------+ |
| | HYPOT(1, 2)| |
| +----------------+ |
| |2.23606797749...| |
| +----------------+ |
| """ |
| return _invoke_binary_math_function("hypot", col1, col2) |
| |
| |
| @_try_remote_functions |
| def pow(col1: Union["ColumnOrName", float], col2: Union["ColumnOrName", float]) -> Column: |
| """ |
| Returns the value of the first argument raised to the power of the second argument. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column`, column name or float |
| the base number. |
| col2 : :class:`~pyspark.sql.Column`, column name or float |
| the exponent number. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
the base raised to the power of the exponent.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(5).select("*", sf.pow("id", 2)).show() |
| +---+------------+ |
| | id|POWER(id, 2)| |
| +---+------------+ |
| | 0| 0.0| |
| | 1| 1.0| |
| | 2| 4.0| |
| | 3| 9.0| |
| | 4| 16.0| |
| +---+------------+ |
| """ |
| return _invoke_binary_math_function("pow", col1, col2) |
| |
| |
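# Alias of `pow`, matching the SQL function name POWER shown in the example output above.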
| power = pow |
| |
| |
| @_try_remote_functions |
| def pmod(dividend: Union["ColumnOrName", float], divisor: Union["ColumnOrName", float]) -> Column: |
| """ |
| Returns the positive value of dividend mod divisor. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| dividend : :class:`~pyspark.sql.Column`, column name or float |
| the column that contains dividend, or the specified dividend value |
| divisor : :class:`~pyspark.sql.Column`, column name or float |
| the column that contains divisor, or the specified divisor value |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| positive value of dividend mod divisor. |
| |
| Notes |
| ----- |
| Supports Spark Connect. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (1.0, float('nan')), (float('nan'), 2.0), (10.0, 3.0), |
| ... (float('nan'), float('nan')), (-3.0, 4.0), (-10.0, 3.0), |
| ... (-5.0, -6.0), (7.0, -8.0), (1.0, 2.0)], |
| ... ("a", "b")) |
| >>> df.select("*", sf.pmod("a", "b")).show() |
| +-----+----+----------+ |
| | a| b|pmod(a, b)| |
| +-----+----+----------+ |
| | 1.0| NaN| NaN| |
| | NaN| 2.0| NaN| |
| | 10.0| 3.0| 1.0| |
| | NaN| NaN| NaN| |
| | -3.0| 4.0| 1.0| |
| |-10.0| 3.0| 2.0| |
| | -5.0|-6.0| -5.0| |
| | 7.0|-8.0| 7.0| |
| | 1.0| 2.0| 1.0| |
| +-----+----+----------+ |
| """ |
| return _invoke_binary_math_function("pmod", dividend, divisor) |
| |
| |
| @_try_remote_functions |
| def width_bucket( |
| v: "ColumnOrName", |
| min: "ColumnOrName", |
| max: "ColumnOrName", |
| numBucket: Union["ColumnOrName", int], |
| ) -> Column: |
| """ |
| Returns the bucket number into which the value of this expression would fall |
after being evaluated. Note that the input arguments must satisfy certain conditions
(for example, `numBucket` must be positive); otherwise, the function returns null.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| v : :class:`~pyspark.sql.Column` or column name |
| value to compute a bucket number in the histogram |
| min : :class:`~pyspark.sql.Column` or column name |
| minimum value of the histogram |
| max : :class:`~pyspark.sql.Column` or column name |
| maximum value of the histogram |
| numBucket : :class:`~pyspark.sql.Column`, column name or int |
| the number of buckets |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the bucket number into which the value would fall after being evaluated |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (5.3, 0.2, 10.6, 5), |
| ... (-2.1, 1.3, 3.4, 3), |
| ... (8.1, 0.0, 5.7, 4), |
| ... (-0.9, 5.2, 0.5, 2)], |
| ... ['v', 'min', 'max', 'n']) |
| >>> df.select("*", sf.width_bucket('v', 'min', 'max', 'n')).show() |
| +----+---+----+---+----------------------------+ |
| | v|min| max| n|width_bucket(v, min, max, n)| |
| +----+---+----+---+----------------------------+ |
| | 5.3|0.2|10.6| 5| 3| |
| |-2.1|1.3| 3.4| 3| 0| |
| | 8.1|0.0| 5.7| 4| 5| |
| |-0.9|5.2| 0.5| 2| 3| |
| +----+---+----+---+----------------------------+ |
| """ |
| numBucket = _enum_to_value(numBucket) |
| numBucket = lit(numBucket) if isinstance(numBucket, int) else numBucket |
| return _invoke_function_over_columns("width_bucket", v, min, max, numBucket) |
| |
| |
| @_try_remote_functions |
| def row_number() -> Column: |
| """ |
| Window function: returns a sequential number starting at 1 within a window partition. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for calculating row numbers. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.range(3) |
| >>> w = Window.orderBy(df.id.desc()) |
| >>> df.withColumn("desc_order", sf.row_number().over(w)).show() |
| +---+----------+ |
| | id|desc_order| |
| +---+----------+ |
| | 2| 1| |
| | 1| 2| |
| | 0| 3| |
| +---+----------+ |
| """ |
| return _invoke_function("row_number") |
| |
| |
| @_try_remote_functions |
| def dense_rank() -> Column: |
| """ |
| Window function: returns the rank of rows within a window partition, without any gaps. |
| |
| The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking |
| sequence when there are ties. That is, if you were ranking a competition using dense_rank |
| and had three people tie for second place, you would say that all three were in second |
place and that the next person came in third. Rank, by contrast, would assign sequential numbers,
so the person who came in third place (after the ties) would register as coming in fifth.
| |
| This is equivalent to the DENSE_RANK function in SQL. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for calculating ranks. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], "int") |
| >>> w = Window.orderBy("value") |
| >>> df.withColumn("drank", sf.dense_rank().over(w)).show() |
| +-----+-----+ |
| |value|drank| |
| +-----+-----+ |
| | 1| 1| |
| | 1| 1| |
| | 2| 2| |
| | 3| 3| |
| | 3| 3| |
| | 4| 4| |
| +-----+-----+ |
| """ |
| return _invoke_function("dense_rank") |
| |
| |
| @_try_remote_functions |
| def rank() -> Column: |
| """ |
| Window function: returns the rank of rows within a window partition. |
| |
| The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking |
| sequence when there are ties. That is, if you were ranking a competition using dense_rank |
| and had three people tie for second place, you would say that all three were in second |
place and that the next person came in third. Rank, by contrast, would assign sequential numbers,
so the person who came in third place (after the ties) would register as coming in fifth.
| |
| This is equivalent to the RANK function in SQL. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for calculating ranks. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], "int") |
| >>> w = Window.orderBy("value") |
| >>> df.withColumn("drank", sf.rank().over(w)).show() |
| +-----+-----+ |
| |value|drank| |
| +-----+-----+ |
| | 1| 1| |
| | 1| 1| |
| | 2| 3| |
| | 3| 4| |
| | 3| 4| |
| | 4| 6| |
| +-----+-----+ |
| """ |
| return _invoke_function("rank") |
| |
| |
| @_try_remote_functions |
| def cume_dist() -> Column: |
| """ |
| Window function: returns the cumulative distribution of values within a window partition, |
i.e. the fraction of rows that are at or below the current row.
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for calculating cumulative distribution. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.createDataFrame([1, 2, 3, 3, 4], "int") |
| >>> w = Window.orderBy("value") |
| >>> df.withColumn("cd", sf.cume_dist().over(w)).show() |
| +-----+---+ |
| |value| cd| |
| +-----+---+ |
| | 1|0.2| |
| | 2|0.4| |
| | 3|0.8| |
| | 3|0.8| |
| | 4|1.0| |
| +-----+---+ |
| """ |
| return _invoke_function("cume_dist") |
| |
| |
| @_try_remote_functions |
| def percent_rank() -> Column: |
| """ |
| Window function: returns the relative rank (i.e. percentile) of rows within a window partition. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for calculating relative rank. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], "int") |
| >>> w = Window.orderBy("value") |
| >>> df.withColumn("pr", sf.percent_rank().over(w)).show() |
| +-----+---+ |
| |value| pr| |
| +-----+---+ |
| | 1|0.0| |
| | 1|0.0| |
| | 2|0.4| |
| | 3|0.6| |
| | 3|0.6| |
| | 4|1.0| |
| +-----+---+ |
| """ |
| return _invoke_function("percent_rank") |
| |
| |
| @_try_remote_functions |
| def approxCountDistinct(col: "ColumnOrName", rsd: Optional[float] = None) -> Column: |
| """ |
| This aggregate function returns a new :class:`~pyspark.sql.Column`, which estimates |
| the approximate distinct count of elements in a specified column or a group of columns. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 2.1.0 |
| Use :func:`approx_count_distinct` instead. |
| """ |
| warnings.warn("Deprecated in 2.1, use approx_count_distinct instead.", FutureWarning) |
| return approx_count_distinct(col, rsd) |
| |
| |
| @_try_remote_functions |
| def approx_count_distinct(col: "ColumnOrName", rsd: Optional[float] = None) -> Column: |
| """ |
| This aggregate function returns a new :class:`~pyspark.sql.Column`, which estimates |
| the approximate distinct count of elements in a specified column or a group of columns. |
| |
| .. versionadded:: 2.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The label of the column to count distinct values in. |
| rsd : float, optional |
| The maximum allowed relative standard deviation (default = 0.05). |
| If rsd < 0.01, it would be more efficient to use :func:`count_distinct`. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new Column object representing the approximate unique count. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.count_distinct` |
| |
| Examples |
| -------- |
| Example 1: Counting distinct values in a single column DataFrame representing integers |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([1,2,2,3], "int") |
| >>> df.agg(sf.approx_count_distinct("value")).show() |
| +----------------------------+ |
| |approx_count_distinct(value)| |
| +----------------------------+ |
| | 3| |
| +----------------------------+ |
| |
| Example 2: Counting distinct values in a single column DataFrame representing strings |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("apple",), ("orange",), ("apple",), ("banana",)], ['fruit']) |
| >>> df.agg(sf.approx_count_distinct("fruit")).show() |
| +----------------------------+ |
| |approx_count_distinct(fruit)| |
| +----------------------------+ |
| | 3| |
| +----------------------------+ |
| |
| Example 3: Counting distinct values in a DataFrame with multiple columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("Alice", 1), ("Alice", 2), ("Bob", 3), ("Bob", 3)], ["name", "value"]) |
| >>> df = df.withColumn("combined", sf.struct("name", "value")) |
| >>> df.agg(sf.approx_count_distinct(df.combined)).show() |
| +-------------------------------+ |
| |approx_count_distinct(combined)| |
| +-------------------------------+ |
| | 3| |
| +-------------------------------+ |
| |
| Example 4: Counting distinct values with a specified relative standard deviation |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(100000).agg( |
| ... sf.approx_count_distinct("id").alias('with_default_rsd'), |
| ... sf.approx_count_distinct("id", 0.1).alias('with_rsd_0.1') |
| ... ).show() |
| +----------------+------------+ |
| |with_default_rsd|with_rsd_0.1| |
| +----------------+------------+ |
| | 95546| 102065| |
| +----------------+------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if rsd is None: |
| return _invoke_function_over_columns("approx_count_distinct", col) |
| else: |
| return _invoke_function("approx_count_distinct", _to_java_column(col), _enum_to_value(rsd)) |
| |
| |
| @_try_remote_functions |
| def broadcast(df: "DataFrame") -> "DataFrame": |
| """ |
| Marks a DataFrame as small enough for use in broadcast joins. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.DataFrame` |
| DataFrame marked as ready for broadcast join. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([1, 2, 3, 3, 4], "int") |
| >>> df_small = spark.range(3) |
| >>> df_b = sf.broadcast(df_small) |
| >>> df.join(df_b, df.value == df_small.id).show() |
| +-----+---+ |
| |value| id| |
| +-----+---+ |
| | 1| 1| |
| | 2| 2| |
| +-----+---+ |
| """ |
| from py4j.java_gateway import JVMView |
| from pyspark.sql.dataframe import DataFrame |
| |
| sc = _get_active_spark_context() |
| return DataFrame(cast(JVMView, sc._jvm).functions.broadcast(df._jdf), df.sparkSession) |
| |
| |
| @_try_remote_functions |
| def coalesce(*cols: "ColumnOrName") -> Column: |
| """Returns the first column that is not null. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
| list of columns to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| value of the first column that is not null. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b")) |
| >>> df.show() |
| +----+----+ |
| | a| b| |
| +----+----+ |
| |NULL|NULL| |
| | 1|NULL| |
| |NULL| 2| |
| +----+----+ |
| |
| >>> df.select('*', sf.coalesce("a", df["b"])).show() |
| +----+----+--------------+ |
| | a| b|coalesce(a, b)| |
| +----+----+--------------+ |
| |NULL|NULL| NULL| |
| | 1|NULL| 1| |
| |NULL| 2| 2| |
| +----+----+--------------+ |
| |
>>> df.select('*', sf.coalesce(df["a"], sf.lit(0.0))).show()
| +----+----+----------------+ |
| | a| b|coalesce(a, 0.0)| |
| +----+----+----------------+ |
| |NULL|NULL| 0.0| |
| | 1|NULL| 1.0| |
| |NULL| 2| 0.0| |
| +----+----+----------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("coalesce", cols) |
| |
| |
| @_try_remote_functions |
| def corr(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """Returns a new :class:`~pyspark.sql.Column` for the Pearson Correlation Coefficient for |
| ``col1`` and ``col2``. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| first column to calculate correlation. |
| col2 : :class:`~pyspark.sql.Column` or column name |
| second column to calculate correlation. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Pearson Correlation Coefficient of these two column values. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> a = range(20) |
| >>> b = [2 * x for x in range(20)] |
| >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) |
| >>> df.agg(sf.corr("a", df.b)).show() |
| +----------+ |
| |corr(a, b)| |
| +----------+ |
| | 1.0| |
| +----------+ |
| """ |
| return _invoke_function_over_columns("corr", col1, col2) |
| |
| |
| @_try_remote_functions |
| def covar_pop(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """Returns a new :class:`~pyspark.sql.Column` for the population covariance of ``col1`` and |
| ``col2``. |
| |
| .. versionadded:: 2.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| first column to calculate covariance. |
| col2 : :class:`~pyspark.sql.Column` or column name |
| second column to calculate covariance. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| covariance of these two column values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.covar_samp` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> a = [1] * 10 |
| >>> b = [1] * 10 |
| >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) |
| >>> df.agg(sf.covar_pop("a", df.b)).show() |
| +---------------+ |
| |covar_pop(a, b)| |
| +---------------+ |
| | 0.0| |
| +---------------+ |
| """ |
| return _invoke_function_over_columns("covar_pop", col1, col2) |
| |
| |
| @_try_remote_functions |
| def covar_samp(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """Returns a new :class:`~pyspark.sql.Column` for the sample covariance of ``col1`` and |
| ``col2``. |
| |
| .. versionadded:: 2.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| first column to calculate covariance. |
| col2 : :class:`~pyspark.sql.Column` or column name |
| second column to calculate covariance. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| sample covariance of these two column values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.covar_pop` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> a = [1] * 10 |
| >>> b = [1] * 10 |
| >>> df = spark.createDataFrame(zip(a, b), ["a", "b"]) |
| >>> df.agg(sf.covar_samp("a", df.b)).show() |
| +----------------+ |
| |covar_samp(a, b)| |
| +----------------+ |
| | 0.0| |
| +----------------+ |
| """ |
| return _invoke_function_over_columns("covar_samp", col1, col2) |
| |
| |
| @_try_remote_functions |
| def countDistinct(col: "ColumnOrName", *cols: "ColumnOrName") -> Column: |
| """Returns a new :class:`~pyspark.sql.Column` for distinct count of ``col`` or ``cols``. |
| |
| An alias of :func:`count_distinct`, and it is encouraged to use :func:`count_distinct` |
| directly. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (1,), (3,)], ["value"]) |
| >>> df.select(sf.count_distinct(df.value)).show() |
| +---------------------+ |
| |count(DISTINCT value)| |
| +---------------------+ |
| | 2| |
| +---------------------+ |
| |
| >>> df.select(sf.countDistinct(df.value)).show() |
| +---------------------+ |
| |count(DISTINCT value)| |
| +---------------------+ |
| | 2| |
| +---------------------+ |
| """ |
| return count_distinct(col, *cols) |
| |
| |
| @_try_remote_functions |
| def count_distinct(col: "ColumnOrName", *cols: "ColumnOrName") -> Column: |
| """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``. |
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| first column to compute on. |
| cols : :class:`~pyspark.sql.Column` or column name |
| other columns to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
the count of distinct values of the given column(s).
| |
| Examples |
| -------- |
| Example 1: Counting distinct values of a single column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (1,), (3,)], ["value"]) |
| >>> df.select(sf.count_distinct(df.value)).show() |
| +---------------------+ |
| |count(DISTINCT value)| |
| +---------------------+ |
| | 2| |
| +---------------------+ |
| |
| Example 2: Counting distinct values of multiple columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, 1), (1, 2)], ["value1", "value2"]) |
| >>> df.select(sf.count_distinct(df.value1, df.value2)).show() |
| +------------------------------+ |
| |count(DISTINCT value1, value2)| |
| +------------------------------+ |
| | 2| |
| +------------------------------+ |
| |
| Example 3: Counting distinct values with column names as strings |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, 1), (1, 2)], ["value1", "value2"]) |
| >>> df.select(sf.count_distinct("value1", "value2")).show() |
| +------------------------------+ |
| |count(DISTINCT value1, value2)| |
| +------------------------------+ |
| | 2| |
| +------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| sc = _get_active_spark_context() |
| return _invoke_function( |
| "count_distinct", _to_java_column(col), _to_seq(sc, cols, _to_java_column) |
| ) |
| |
| |
| @_try_remote_functions |
| def first(col: "ColumnOrName", ignorenulls: bool = False) -> Column: |
| """Aggregate function: returns the first value in a group. |
| |
| The function by default returns the first values it sees. It will return the first non-null |
| value it sees when ignoreNulls is set to true. If all values are null, then null is returned. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
The function is non-deterministic because its result depends on the order of the
rows, which may be non-deterministic after a shuffle.
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to fetch first value for. |
| ignorenulls : bool |
if the first value is null, then look for the first non-null value. ``False`` by default.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| first value of the group. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age")) |
| >>> df = df.orderBy(df.age) |
| >>> df.groupby("name").agg(sf.first("age")).orderBy("name").show() |
| +-----+----------+ |
| | name|first(age)| |
| +-----+----------+ |
| |Alice| NULL| |
| | Bob| 5| |
| +-----+----------+ |
| |
| To ignore any null values, set ``ignorenulls`` to `True` |
| |
| >>> df.groupby("name").agg(sf.first("age", ignorenulls=True)).orderBy("name").show() |
| +-----+----------+ |
| | name|first(age)| |
| +-----+----------+ |
| |Alice| 2| |
| | Bob| 5| |
| +-----+----------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("first", _to_java_column(col), _enum_to_value(ignorenulls)) |
| |
| |
| @_try_remote_functions |
| def grouping(col: "ColumnOrName") -> Column: |
| """ |
Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated
or not; returns 1 for aggregated or 0 for not aggregated in the result set.
| |
| .. versionadded:: 2.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to check if it's aggregated. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| returns 1 for aggregated or 0 for not aggregated in the result set. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) |
| >>> df.cube("name").agg(sf.grouping("name"), sf.sum("age")).orderBy("name").show() |
| +-----+--------------+--------+ |
| | name|grouping(name)|sum(age)| |
| +-----+--------------+--------+ |
| | NULL| 1| 7| |
| |Alice| 0| 2| |
| | Bob| 0| 5| |
| +-----+--------------+--------+ |
| """ |
| return _invoke_function_over_columns("grouping", col) |
| |
| |
| @_try_remote_functions |
| def grouping_id(*cols: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the level of grouping, equals to |
| |
| (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) |
| |
| .. versionadded:: 2.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
The list of columns should match the grouping columns exactly, or be empty (meaning all
the grouping columns).
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
| columns to check for. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| returns level of the grouping it relates to. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(1, "a", "a"), (3, "a", "a"), (4, "b", "c")], ["c1", "c2", "c3"]) |
| >>> df.cube("c2", "c3").agg(sf.grouping_id(), sf.sum("c1")).orderBy("c2", "c3").show() |
| +----+----+-------------+-------+ |
| | c2| c3|grouping_id()|sum(c1)| |
| +----+----+-------------+-------+ |
| |NULL|NULL| 3| 8| |
| |NULL| a| 2| 4| |
| |NULL| c| 2| 4| |
| | a|NULL| 1| 4| |
| | a| a| 0| 4| |
| | b|NULL| 1| 4| |
| | b| c| 0| 4| |
| +----+----+-------------+-------+ |
| """ |
| return _invoke_function_over_seq_of_columns("grouping_id", cols) |
| |
| |
| @_try_remote_functions |
| def count_min_sketch( |
| col: "ColumnOrName", |
| eps: Union[Column, float], |
| confidence: Union[Column, float], |
| seed: Optional[Union[Column, int]] = None, |
| ) -> Column: |
| """ |
Returns a count-min sketch of a column with the given eps, confidence and seed.
The result is an array of bytes, which can be deserialized to a `CountMinSketch` before usage.
A count-min sketch is a probabilistic data structure used for frequency estimation
using sub-linear space.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| eps : :class:`~pyspark.sql.Column` or float |
| relative error, must be positive |
| |
| .. versionchanged:: 4.0.0 |
| `eps` now accepts float value. |
| |
| confidence : :class:`~pyspark.sql.Column` or float |
| confidence, must be positive and less than 1.0 |
| |
| .. versionchanged:: 4.0.0 |
| `confidence` now accepts float value. |
| |
| seed : :class:`~pyspark.sql.Column` or int, optional |
| random seed |
| |
| .. versionchanged:: 4.0.0 |
| `seed` now accepts int value. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| count-min sketch of the column |
| |
| Examples |
| -------- |
| Example 1: Using columns as arguments |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(100).select( |
| ... sf.hex(sf.count_min_sketch(sf.col("id"), sf.lit(3.0), sf.lit(0.1), sf.lit(1))) |
| ... ).show(truncate=False) |
| +------------------------------------------------------------------------+ |
| |hex(count_min_sketch(id, 3.0, 0.1, 1)) | |
| +------------------------------------------------------------------------+ |
| |0000000100000000000000640000000100000001000000005D8D6AB90000000000000064| |
| +------------------------------------------------------------------------+ |
| |
| Example 2: Using numbers as arguments |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(100).select( |
| ... sf.hex(sf.count_min_sketch("id", 1.0, 0.3, 2)) |
| ... ).show(truncate=False) |
| +----------------------------------------------------------------------------------------+ |
| |hex(count_min_sketch(id, 1.0, 0.3, 2)) | |
| +----------------------------------------------------------------------------------------+ |
| |0000000100000000000000640000000100000002000000005D96391C00000000000000320000000000000032| |
| +----------------------------------------------------------------------------------------+ |
| |
| Example 3: Using a long seed |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(100).select( |
| ... sf.hex(sf.count_min_sketch("id", sf.lit(1.5), 0.2, 1111111111111111111)) |
| ... ).show(truncate=False) |
| +----------------------------------------------------------------------------------------+ |
| |hex(count_min_sketch(id, 1.5, 0.2, 1111111111111111111)) | |
| +----------------------------------------------------------------------------------------+ |
| |00000001000000000000006400000001000000020000000044078BA100000000000000320000000000000032| |
| +----------------------------------------------------------------------------------------+ |
| |
| Example 4: Using a random seed |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(100).select( |
| ... sf.hex(sf.count_min_sketch("id", sf.lit(1.5), 0.6)) |
| ... ).show(truncate=False) # doctest: +SKIP |
| +----------------------------------------------------------------------------------------------------------------------------------------+ |
| |hex(count_min_sketch(id, 1.5, 0.6, 2120704260)) | |
| +----------------------------------------------------------------------------------------------------------------------------------------+ |
| |0000000100000000000000640000000200000002000000005ADECCEE00000000153EBE090000000000000033000000000000003100000000000000320000000000000032| |
| +----------------------------------------------------------------------------------------------------------------------------------------+ |
| """ # noqa: E501 |
| _eps = lit(eps) |
| _conf = lit(confidence) |
| if seed is None: |
| return _invoke_function_over_columns("count_min_sketch", col, _eps, _conf) |
| else: |
| return _invoke_function_over_columns("count_min_sketch", col, _eps, _conf, lit(seed)) |
| |
| |
| @_try_remote_functions |
| def input_file_name() -> Column: |
| """ |
| Creates a string column for the file name of the current Spark task. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| file names. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.input_file_block_length` |
| :meth:`pyspark.sql.functions.input_file_block_start` |
| |
| Examples |
| -------- |
| >>> import os |
| >>> from pyspark.sql import functions as sf |
| >>> path = os.path.abspath(__file__) |
| >>> df = spark.read.text(path) |
| >>> df.select(sf.input_file_name()).first() |
| Row(input_file_name()='file:///...') |
| """ |
| return _invoke_function("input_file_name") |
| |
| |
| @_try_remote_functions |
| def isnan(col: "ColumnOrName") -> Column: |
| """An expression that returns true if the column is NaN. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| True if value is NaN and False otherwise. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) |
| >>> df.select("*", sf.isnan("a"), sf.isnan(df.b)).show() |
| +---+---+--------+--------+ |
| | a| b|isnan(a)|isnan(b)| |
| +---+---+--------+--------+ |
| |1.0|NaN| false| true| |
| |NaN|2.0| true| false| |
| +---+---+--------+--------+ |
| """ |
| return _invoke_function_over_columns("isnan", col) |
| |
| |
| @_try_remote_functions |
| def isnull(col: "ColumnOrName") -> Column: |
| """An expression that returns true if the column is null. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| True if value is null and False otherwise. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b")) |
>>> df.select("*", sf.isnull("a"), sf.isnull(df.b)).show()
| +----+----+-----------+-----------+ |
| | a| b|(a IS NULL)|(b IS NULL)| |
| +----+----+-----------+-----------+ |
| | 1|NULL| false| true| |
| |NULL| 2| true| false| |
| +----+----+-----------+-----------+ |
| """ |
| return _invoke_function_over_columns("isnull", col) |
| |
| |
| @_try_remote_functions |
| def last(col: "ColumnOrName", ignorenulls: bool = False) -> Column: |
| """Aggregate function: returns the last value in a group. |
| |
| The function by default returns the last values it sees. It will return the last non-null |
| value it sees when ignoreNulls is set to true. If all values are null, then null is returned. |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
The function is non-deterministic because its result depends on the order of the
rows, which may be non-deterministic after a shuffle.
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column to fetch last value for. |
| ignorenulls : bool |
if the last value is null, then look for the last non-null value. ``False`` by default.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| last value of the group. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age")) |
| >>> df = df.orderBy(df.age.desc()) |
| >>> df.groupby("name").agg(sf.last("age")).orderBy("name").show() |
| +-----+---------+ |
| | name|last(age)| |
| +-----+---------+ |
| |Alice| NULL| |
| | Bob| 5| |
| +-----+---------+ |
| |
| To ignore any null values, set ``ignorenulls`` to `True` |
| |
| >>> df.groupby("name").agg(sf.last("age", ignorenulls=True)).orderBy("name").show() |
| +-----+---------+ |
| | name|last(age)| |
| +-----+---------+ |
| |Alice| 2| |
| | Bob| 5| |
| +-----+---------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("last", _to_java_column(col), _enum_to_value(ignorenulls)) |
| |
| |
| @_try_remote_functions |
| def monotonically_increasing_id() -> Column: |
| """A column that generates monotonically increasing 64-bit integers. |
| |
| The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive. |
| The current implementation puts the partition ID in the upper 31 bits, and the record number |
| within each partition in the lower 33 bits. The assumption is that the data frame has |
| less than 1 billion partitions, and each partition has less than 8 billion records. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
| The function is non-deterministic because its result depends on partition IDs. |
| |
| As an example, consider a :class:`DataFrame` with two partitions, each with 3 records. |
| This expression would return the following IDs: |
| 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
a column of monotonically increasing 64-bit integers.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(0, 10, 1, 2).select( |
| ... "*", |
| ... sf.spark_partition_id(), |
| ... sf.monotonically_increasing_id()).show() |
| +---+--------------------+-----------------------------+ |
| | id|SPARK_PARTITION_ID()|monotonically_increasing_id()| |
| +---+--------------------+-----------------------------+ |
| | 0| 0| 0| |
| | 1| 0| 1| |
| | 2| 0| 2| |
| | 3| 0| 3| |
| | 4| 0| 4| |
| | 5| 1| 8589934592| |
| | 6| 1| 8589934593| |
| | 7| 1| 8589934594| |
| | 8| 1| 8589934595| |
| | 9| 1| 8589934596| |
| +---+--------------------+-----------------------------+ |
| """ |
| return _invoke_function("monotonically_increasing_id") |
| |
| |
| @_try_remote_functions |
| def nanvl(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """Returns col1 if it is not NaN, or col2 if col1 is NaN. |
| |
| Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`). |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| first column to check. |
| col2 : :class:`~pyspark.sql.Column` or column name |
| second column to return if first is NaN. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
value from the first column, or the second column if the first is NaN.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b")) |
| >>> df.select("*", sf.nanvl("a", "b"), sf.nanvl(df.a, df.b)).show() |
| +---+---+-----------+-----------+ |
| | a| b|nanvl(a, b)|nanvl(a, b)| |
| +---+---+-----------+-----------+ |
| |1.0|NaN| 1.0| 1.0| |
| |NaN|2.0| 2.0| 2.0| |
| +---+---+-----------+-----------+ |
| """ |
| return _invoke_function_over_columns("nanvl", col1, col2) |
| |
| |
| @_try_remote_functions |
| def percentile( |
| col: "ColumnOrName", |
| percentage: Union[Column, float, Sequence[float], Tuple[float, ...]], |
| frequency: Union[Column, int] = 1, |
) -> Column:
"""Returns the exact percentile(s) of the numeric column `col` at the given percentage(s)
with value range in [0.0, 1.0].
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats |
| percentage in decimal (must be between 0.0 and 1.0). |
frequency : :class:`~pyspark.sql.Column` or int
a positive numeric literal which controls frequency.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the exact `percentile` of the numeric column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.median` |
| :meth:`pyspark.sql.functions.approx_percentile` |
| :meth:`pyspark.sql.functions.percentile_approx` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> key = (sf.col("id") % 3).alias("key") |
| >>> value = (sf.randn(42) + key * 10).alias("value") |
| >>> df = spark.range(0, 1000, 1, 1).select(key, value) |
| >>> df.select( |
| ... sf.percentile("value", [0.25, 0.5, 0.75], sf.lit(1)) |
| ... ).show(truncate=False) |
| +--------------------------------------------------------+ |
| |percentile(value, array(0.25, 0.5, 0.75), 1) | |
| +--------------------------------------------------------+ |
| |[0.7441991494121..., 9.9900713756..., 19.33740203080...]| |
| +--------------------------------------------------------+ |
| |
| >>> df.groupBy("key").agg( |
| ... sf.percentile("value", sf.lit(0.5), sf.lit(1)) |
| ... ).sort("key").show() |
| +---+-------------------------+ |
| |key|percentile(value, 0.5, 1)| |
| +---+-------------------------+ |
| | 0| -0.03449962216667901| |
| | 1| 9.990389751837329| |
| | 2| 19.967859769284075| |
| +---+-------------------------+ |
| """ |
| percentage = lit(list(percentage)) if isinstance(percentage, (list, tuple)) else lit(percentage) |
| return _invoke_function_over_columns("percentile", col, percentage, lit(frequency)) |
| |
| |
| @_try_remote_functions |
| def percentile_approx( |
| col: "ColumnOrName", |
| percentage: Union[Column, float, Sequence[float], Tuple[float, ...]], |
| accuracy: Union[Column, int] = 10000, |
| ) -> Column: |
| """Returns the approximate `percentile` of the numeric column `col` which is the smallest value |
| in the ordered `col` values (sorted from least to greatest) such that no more than `percentage` |
of `col` values is less than or equal to that value.
| |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column. |
| percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats |
| percentage in decimal (must be between 0.0 and 1.0). |
| When percentage is an array, each value of the percentage array must be between 0.0 and 1.0. |
| In this case, returns the approximate percentile array of column col |
| at the given percentage array. |
| accuracy : :class:`~pyspark.sql.Column` or int |
| is a positive numeric literal which controls approximation accuracy |
at the cost of memory. A higher value of accuracy yields better accuracy;
1.0/accuracy is the relative error of the approximation (default: 10000).
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| approximate `percentile` of the numeric column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.median` |
| :meth:`pyspark.sql.functions.percentile` |
| :meth:`pyspark.sql.functions.approx_percentile` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> key = (sf.col("id") % 3).alias("key") |
| >>> value = (sf.randn(42) + key * 10).alias("value") |
| >>> df = spark.range(0, 1000, 1, 1).select(key, value) |
| >>> df.select( |
| ... sf.percentile_approx("value", [0.25, 0.5, 0.75], 1000000) |
| ... ).show(truncate=False) |
| +----------------------------------------------------------+ |
| |percentile_approx(value, array(0.25, 0.5, 0.75), 1000000) | |
| +----------------------------------------------------------+ |
| |[0.7264430125286..., 9.98975299938..., 19.335304783039...]| |
| +----------------------------------------------------------+ |
| |
| >>> df.groupBy("key").agg( |
| ... sf.percentile_approx("value", sf.lit(0.5), sf.lit(1000000)) |
| ... ).sort("key").show() |
| +---+--------------------------------------+ |
| |key|percentile_approx(value, 0.5, 1000000)| |
| +---+--------------------------------------+ |
| | 0| -0.03519435193070...| |
| | 1| 9.990389751837...| |
| | 2| 19.967859769284...| |
| +---+--------------------------------------+ |
| """ |
| percentage = lit(list(percentage)) if isinstance(percentage, (list, tuple)) else lit(percentage) |
| return _invoke_function_over_columns("percentile_approx", col, percentage, lit(accuracy)) |
| |
| |
| @_try_remote_functions |
| def approx_percentile( |
| col: "ColumnOrName", |
| percentage: Union[Column, float, Sequence[float], Tuple[float, ...]], |
| accuracy: Union[Column, int] = 10000, |
| ) -> Column: |
| """Returns the approximate `percentile` of the numeric column `col` which is the smallest value |
| in the ordered `col` values (sorted from least to greatest) such that no more than `percentage` |
of `col` values is less than or equal to that value.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column. |
| percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats |
| percentage in decimal (must be between 0.0 and 1.0). |
| When percentage is an array, each value of the percentage array must be between 0.0 and 1.0. |
| In this case, returns the approximate percentile array of column col |
| at the given percentage array. |
| accuracy : :class:`~pyspark.sql.Column` or int |
| is a positive numeric literal which controls approximation accuracy |
at the cost of memory. A higher value of accuracy yields better accuracy;
1.0/accuracy is the relative error of the approximation (default: 10000).
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| approximate `percentile` of the numeric column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.median` |
| :meth:`pyspark.sql.functions.percentile` |
| :meth:`pyspark.sql.functions.percentile_approx` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> key = (sf.col("id") % 3).alias("key") |
| >>> value = (sf.randn(42) + key * 10).alias("value") |
| >>> df = spark.range(0, 1000, 1, 1).select(key, value) |
| >>> df.select( |
| ... sf.approx_percentile("value", [0.25, 0.5, 0.75], 1000000) |
| ... ).show(truncate=False) |
| +----------------------------------------------------------+ |
| |approx_percentile(value, array(0.25, 0.5, 0.75), 1000000) | |
| +----------------------------------------------------------+ |
| |[0.7264430125286..., 9.98975299938..., 19.335304783039...]| |
| +----------------------------------------------------------+ |
| |
| >>> df.groupBy("key").agg( |
| ... sf.approx_percentile("value", sf.lit(0.5), sf.lit(1000000)) |
| ... ).sort("key").show() |
| +---+--------------------------------------+ |
| |key|approx_percentile(value, 0.5, 1000000)| |
| +---+--------------------------------------+ |
| | 0| -0.03519435193070...| |
| | 1| 9.990389751837...| |
| | 2| 19.967859769284...| |
| +---+--------------------------------------+ |
| """ |
| percentage = lit(list(percentage)) if isinstance(percentage, (list, tuple)) else lit(percentage) |
| return _invoke_function_over_columns("approx_percentile", col, percentage, lit(accuracy)) |
| |
| |
| @_try_remote_functions |
| def rand(seed: Optional[int] = None) -> Column: |
| """Generates a random column with independent and identically distributed (i.i.d.) samples |
| uniformly distributed in [0.0, 1.0). |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
The function is non-deterministic in the general case.
| |
| Parameters |
| ---------- |
| seed : int, optional |
| Seed value for the random generator. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column of random values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.randn` |
| :meth:`pyspark.sql.functions.randstr` |
| :meth:`pyspark.sql.functions.uniform` |
| |
| Examples |
| -------- |
| Example 1: Generate a random column without a seed |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(0, 2, 1, 1).select("*", sf.rand()).show() # doctest: +SKIP |
| +---+-------------------------+ |
| | id|rand(-158884697681280011)| |
| +---+-------------------------+ |
| | 0| 0.9253464547887...| |
| | 1| 0.6533254118758...| |
| +---+-------------------------+ |
| |
| Example 2: Generate a random column with a specific seed |
| |
| >>> spark.range(0, 2, 1, 1).select("*", sf.rand(seed=42)).show() |
| +---+------------------+ |
| | id| rand(42)| |
| +---+------------------+ |
| | 0| 0.619189370225...| |
| | 1|0.5096018842446...| |
| +---+------------------+ |
| """ |
| if seed is not None: |
| return _invoke_function("rand", _enum_to_value(seed)) |
| else: |
| return _invoke_function("rand") |
| |
| |
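# Alias of `rand`.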
| random = rand |
| |
| |
| @_try_remote_functions |
| def randn(seed: Optional[int] = None) -> Column: |
| """Generates a random column with independent and identically distributed (i.i.d.) samples |
| from the standard normal distribution. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
The function is non-deterministic in the general case.
| |
| Parameters |
| ---------- |
seed : int, optional
| Seed value for the random generator. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column of random values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.rand` |
| :meth:`pyspark.sql.functions.randstr` |
| :meth:`pyspark.sql.functions.uniform` |
| |
| Examples |
| -------- |
| Example 1: Generate a random column without a seed |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(0, 2, 1, 1).select("*", sf.randn()).show() # doctest: +SKIP |
| +---+--------------------------+ |
| | id|randn(3968742514375399317)| |
| +---+--------------------------+ |
| | 0| -0.47968645355788...| |
| | 1| -0.4950952457305...| |
| +---+--------------------------+ |
| |
| Example 2: Generate a random column with a specific seed |
| |
| >>> spark.range(0, 2, 1, 1).select("*", sf.randn(seed=42)).show() |
| +---+------------------+ |
| | id| randn(42)| |
| +---+------------------+ |
| | 0| 2.384479054241...| |
| | 1|0.1920934041293...| |
| +---+------------------+ |
| """ |
| if seed is not None: |
| return _invoke_function("randn", _enum_to_value(seed)) |
| else: |
| return _invoke_function("randn") |
| |
| |
| @_try_remote_functions |
| def round(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Column: |
| """ |
    Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` >= 0
    or at the integral part when `scale` < 0.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column or column name to compute the round on. |
| scale : :class:`~pyspark.sql.Column` or int, optional |
| An optional parameter to control the rounding behavior. |
| |
| .. versionchanged:: 4.0.0 |
| Support Column type. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column for the rounded value. |
| |
| Examples |
| -------- |
    Example 1: Compute the rounded value of a column
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.round(sf.lit(2.5))).show() |
| +-------------+ |
| |round(2.5, 0)| |
| +-------------+ |
| | 3.0| |
| +-------------+ |
| |
    Example 2: Compute the rounded value of a column with a specified scale
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.round(sf.lit(2.1267), sf.lit(2))).show() |
| +----------------+ |
| |round(2.1267, 2)| |
| +----------------+ |
| | 2.13| |
| +----------------+ |
| """ |
| if scale is None: |
| return _invoke_function_over_columns("round", col) |
| else: |
| scale = _enum_to_value(scale) |
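        # Wrap a plain int scale in a literal Column for the Column-based invoker.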
| scale = lit(scale) if isinstance(scale, int) else scale |
| return _invoke_function_over_columns("round", col, scale) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def bround(col: "ColumnOrName", scale: Optional[Union[Column, int]] = None) -> Column: |
| """ |
    Round the given value to `scale` decimal places using HALF_EVEN rounding mode if `scale` >= 0
    or at the integral part when `scale` < 0.
| |
| .. versionadded:: 2.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The target column or column name to compute the round on. |
| scale : :class:`~pyspark.sql.Column` or int, optional |
| An optional parameter to control the rounding behavior. |
| |
| .. versionchanged:: 4.0.0 |
| Support Column type. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column for the rounded value. |
| |
| Examples |
| -------- |
    Example 1: Compute the rounded value of a column
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.bround(sf.lit(2.5))).show() |
| +--------------+ |
| |bround(2.5, 0)| |
| +--------------+ |
| | 2.0| |
| +--------------+ |
| |
    Example 2: Compute the rounded value of a column with a specified scale
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.bround(sf.lit(2.1267), sf.lit(2))).show() |
| +-----------------+ |
| |bround(2.1267, 2)| |
| +-----------------+ |
| | 2.13| |
| +-----------------+ |
| """ |
| if scale is None: |
| return _invoke_function_over_columns("bround", col) |
| else: |
| scale = _enum_to_value(scale) |
| scale = lit(scale) if isinstance(scale, int) else scale |
| return _invoke_function_over_columns("bround", col, scale) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def shiftLeft(col: "ColumnOrName", numBits: int) -> Column: |
| """Shift the given value numBits left. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 3.2.0 |
| Use :func:`shiftleft` instead. |
| """ |
| warnings.warn("Deprecated in 3.2, use shiftleft instead.", FutureWarning) |
| return shiftleft(col, numBits) |
| |
| |
| @_try_remote_functions |
| def shiftleft(col: "ColumnOrName", numBits: int) -> Column: |
| """Shift the given value numBits left. |
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to shift. |
| numBits : int |
| number of bits to shift. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| shifted value. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(4).select("*", sf.shiftleft('id', 1)).show() |
| +---+----------------+ |
| | id|shiftleft(id, 1)| |
| +---+----------------+ |
| | 0| 0| |
| | 1| 2| |
| | 2| 4| |
| | 3| 6| |
| +---+----------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("shiftleft", _to_java_column(col), _enum_to_value(numBits)) |
| |
| |
| @_try_remote_functions |
| def shiftRight(col: "ColumnOrName", numBits: int) -> Column: |
| """(Signed) shift the given value numBits right. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 3.2.0 |
| Use :func:`shiftright` instead. |
| """ |
| warnings.warn("Deprecated in 3.2, use shiftright instead.", FutureWarning) |
| return shiftright(col, numBits) |
| |
| |
| @_try_remote_functions |
| def shiftright(col: "ColumnOrName", numBits: int) -> Column: |
| """(Signed) shift the given value numBits right. |
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to shift. |
| numBits : int |
| number of bits to shift. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| shifted values. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(4).select("*", sf.shiftright('id', 1)).show() |
| +---+-----------------+ |
| | id|shiftright(id, 1)| |
| +---+-----------------+ |
| | 0| 0| |
| | 1| 0| |
| | 2| 1| |
| | 3| 1| |
| +---+-----------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("shiftright", _to_java_column(col), _enum_to_value(numBits)) |
| |
| |
| @_try_remote_functions |
| def shiftRightUnsigned(col: "ColumnOrName", numBits: int) -> Column: |
| """Unsigned shift the given value numBits right. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 3.2.0 |
| Use :func:`shiftrightunsigned` instead. |
| """ |
| warnings.warn("Deprecated in 3.2, use shiftrightunsigned instead.", FutureWarning) |
| return shiftrightunsigned(col, numBits) |
| |
| |
| @_try_remote_functions |
| def shiftrightunsigned(col: "ColumnOrName", numBits: int) -> Column: |
| """Unsigned shift the given value numBits right. |
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to shift. |
| numBits : int |
| number of bits to shift. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| shifted value. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(4).select("*", sf.shiftrightunsigned(sf.col('id') - 2, 1)).show() |
| +---+-------------------------------+ |
| | id|shiftrightunsigned((id - 2), 1)| |
| +---+-------------------------------+ |
| | 0| 9223372036854775807| |
| | 1| 9223372036854775807| |
| | 2| 0| |
| | 3| 0| |
| +---+-------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("shiftrightunsigned", _to_java_column(col), _enum_to_value(numBits)) |
| |
| |
| @_try_remote_functions |
| def spark_partition_id() -> Column: |
| """A column for partition ID. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
    This is non-deterministic because it depends on data partitioning and task scheduling.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| partition id the record belongs to. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(10, numPartitions=5).select("*", sf.spark_partition_id()).show() |
| +---+--------------------+ |
| | id|SPARK_PARTITION_ID()| |
| +---+--------------------+ |
| | 0| 0| |
| | 1| 0| |
| | 2| 1| |
| | 3| 1| |
| | 4| 2| |
| | 5| 2| |
| | 6| 3| |
| | 7| 3| |
| | 8| 4| |
| | 9| 4| |
| +---+--------------------+ |
| """ |
| return _invoke_function("spark_partition_id") |
| |
| |
| @_try_remote_functions |
| def expr(str: str) -> Column: |
| """Parses the expression string into the column that it represents |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
    str : str
        expression defined as a string.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| column representing the expression. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([["Alice"], ["Bob"]], ["name"]) |
| >>> df.select("*", sf.expr("length(name)")).show() |
| +-----+------------+ |
| | name|length(name)| |
| +-----+------------+ |
| |Alice| 5| |
| | Bob| 3| |
| +-----+------------+ |
| """ |
| return _invoke_function("expr", str) |
| |
| |
| @overload |
| def struct(*cols: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @overload |
| def struct(__cols: Union[Sequence["ColumnOrName"], Tuple["ColumnOrName", ...]]) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def struct( |
| *cols: Union["ColumnOrName", Union[Sequence["ColumnOrName"], Tuple["ColumnOrName", ...]]] |
| ) -> Column: |
| """Creates a new struct column. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : list, set, :class:`~pyspark.sql.Column` or column name |
| column names or :class:`~pyspark.sql.Column`\\s to contain in the output struct. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a struct type column of given columns. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.named_struct` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) |
| >>> df.select("*", sf.struct('age', df.name)).show() |
| +-----+---+-----------------+ |
| | name|age|struct(age, name)| |
| +-----+---+-----------------+ |
| |Alice| 2| {2, Alice}| |
| | Bob| 5| {5, Bob}| |
| +-----+---+-----------------+ |
| """ |
| if len(cols) == 1 and isinstance(cols[0], (list, set)): |
| cols = cols[0] # type: ignore[assignment] |
| return _invoke_function_over_seq_of_columns("struct", cols) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def named_struct(*cols: "ColumnOrName") -> Column: |
| """ |
| Creates a struct with the given field names and values. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
| list of columns to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.struct` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, 2)], ['a', 'b']) |
| >>> df.select("*", sf.named_struct(sf.lit('x'), df.a, sf.lit('y'), "b")).show() |
| +---+---+------------------------+ |
| | a| b|named_struct(x, a, y, b)| |
| +---+---+------------------------+ |
| | 1| 2| {1, 2}| |
| +---+---+------------------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("named_struct", cols) |
| |
| |
| @_try_remote_functions |
| def greatest(*cols: "ColumnOrName") -> Column: |
| """ |
| Returns the greatest value of the list of column names, skipping null values. |
| This function takes at least 2 parameters. It will return null if all parameters are null. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
    cols : :class:`~pyspark.sql.Column` or column name
| columns to check for greatest value. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| greatest value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.least` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c']) |
| >>> df.select("*", sf.greatest(df.a, "b", df.c)).show() |
| +---+---+---+-----------------+ |
| | a| b| c|greatest(a, b, c)| |
| +---+---+---+-----------------+ |
| | 1| 4| 3| 4| |
| +---+---+---+-----------------+ |
| """ |
| if len(cols) < 2: |
| raise PySparkValueError( |
| errorClass="WRONG_NUM_COLUMNS", |
| messageParameters={"func_name": "greatest", "num_cols": "2"}, |
| ) |
| return _invoke_function_over_seq_of_columns("greatest", cols) |
| |
| |
| @_try_remote_functions |
| def least(*cols: "ColumnOrName") -> Column: |
| """ |
| Returns the least value of the list of column names, skipping null values. |
| This function takes at least 2 parameters. It will return null if all parameters are null. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
| column names or columns to be compared |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| least value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.greatest` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c']) |
| >>> df.select("*", sf.least(df.a, "b", df.c)).show() |
| +---+---+---+--------------+ |
| | a| b| c|least(a, b, c)| |
| +---+---+---+--------------+ |
| | 1| 4| 3| 1| |
| +---+---+---+--------------+ |
| """ |
| if len(cols) < 2: |
| raise PySparkValueError( |
| errorClass="WRONG_NUM_COLUMNS", |
| messageParameters={"func_name": "least", "num_cols": "2"}, |
| ) |
| return _invoke_function_over_seq_of_columns("least", cols) |
| |
| |
| @_try_remote_functions |
| def when(condition: Column, value: Any) -> Column: |
| """Evaluates a list of conditions and returns one of multiple possible result expressions. |
| If :func:`pyspark.sql.Column.otherwise` is not invoked, None is returned for unmatched |
| conditions. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| condition : :class:`~pyspark.sql.Column` |
| a boolean :class:`~pyspark.sql.Column` expression. |
| value : |
| a literal value, or a :class:`~pyspark.sql.Column` expression. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| column representing when expression. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.Column.when` |
| :meth:`pyspark.sql.Column.otherwise` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.range(3) |
| >>> df.select("*", sf.when(df['id'] == 2, 3).otherwise(4)).show() |
| +---+------------------------------------+ |
| | id|CASE WHEN (id = 2) THEN 3 ELSE 4 END| |
| +---+------------------------------------+ |
| | 0| 4| |
| | 1| 4| |
| | 2| 3| |
| +---+------------------------------------+ |
| |
| >>> df.select("*", sf.when(df.id == 2, df.id + 1)).show() |
| +---+------------------------------------+ |
| | id|CASE WHEN (id = 2) THEN (id + 1) END| |
| +---+------------------------------------+ |
| | 0| NULL| |
| | 1| NULL| |
| | 2| 3| |
| +---+------------------------------------+ |
| """ |
| # Explicitly not using ColumnOrName type here to make reading condition less opaque |
| if not isinstance(condition, Column): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN", |
| messageParameters={"arg_name": "condition", "arg_type": type(condition).__name__}, |
| ) |
| value = _enum_to_value(value) |
| v = value._jc if isinstance(value, Column) else _enum_to_value(value) |
| |
| return _invoke_function("when", condition._jc, v) |
| |
| |
| @overload # type: ignore[no-redef] |
| def log(arg1: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @overload |
| def log(arg1: float, arg2: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def log(arg1: Union["ColumnOrName", float], arg2: Optional["ColumnOrName"] = None) -> Column: |
| """Returns the first argument-based logarithm of the second argument. |
| |
| If there is only one argument, then this takes the natural logarithm of the argument. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| arg1 : :class:`~pyspark.sql.Column`, str or float |
        base number, or the actual number (in which case the base is `e`)
| arg2 : :class:`~pyspark.sql.Column`, str or float, optional |
        number to calculate the logarithm for.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        logarithm of the given value.
| |
| Examples |
| -------- |
| Example 1: Specify both base number and the input value |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1), (2), (4) AS t(value)") |
| >>> df.select("*", sf.log(2.0, df.value)).show() |
| +-----+---------------+ |
| |value|LOG(2.0, value)| |
| +-----+---------------+ |
| | 1| 0.0| |
| | 2| 1.0| |
| | 4| 2.0| |
| +-----+---------------+ |
| |
| Example 2: Return NULL for invalid input values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1), (2), (0), (-1), (NULL) AS t(value)") |
| >>> df.select("*", sf.log(3.0, df.value)).show() |
| +-----+------------------+ |
| |value| LOG(3.0, value)| |
| +-----+------------------+ |
| | 1| 0.0| |
| | 2|0.6309297535714...| |
| | 0| NULL| |
| | -1| NULL| |
| | NULL| NULL| |
| +-----+------------------+ |
| |
| Example 3: Specify only the input value (Natural logarithm) |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT * FROM VALUES (1), (2), (4) AS t(value)") |
| >>> df.select("*", sf.log(df.value)).show() |
| +-----+------------------+ |
| |value| ln(value)| |
| +-----+------------------+ |
| | 1| 0.0| |
| | 2|0.6931471805599...| |
| | 4|1.3862943611198...| |
| +-----+------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if arg2 is None: |
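        # Single-argument form: natural logarithm of arg1.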
| return _invoke_function_over_columns("log", cast("ColumnOrName", arg1)) |
| else: |
| return _invoke_function("log", _enum_to_value(arg1), _to_java_column(arg2)) |
| |
| |
| @_try_remote_functions |
| def ln(col: "ColumnOrName") -> Column: |
| """Returns the natural logarithm of the argument. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
        a column to calculate the logarithm for.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        natural logarithm of the given value.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(10).select("*", sf.ln('id')).show() |
| +---+------------------+ |
| | id| ln(id)| |
| +---+------------------+ |
| | 0| NULL| |
| | 1| 0.0| |
| | 2|0.6931471805599...| |
| | 3|1.0986122886681...| |
| | 4|1.3862943611198...| |
| | 5|1.6094379124341...| |
| | 6| 1.791759469228...| |
| | 7|1.9459101490553...| |
| | 8|2.0794415416798...| |
| | 9|2.1972245773362...| |
| +---+------------------+ |
| """ |
| return _invoke_function_over_columns("ln", col) |
| |
| |
| @_try_remote_functions |
| def log2(col: "ColumnOrName") -> Column: |
| """Returns the base-2 logarithm of the argument. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
        a column to calculate the logarithm for.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        logarithm of the given value.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(10).select("*", sf.log2('id')).show() |
| +---+------------------+ |
| | id| LOG2(id)| |
| +---+------------------+ |
| | 0| NULL| |
| | 1| 0.0| |
| | 2| 1.0| |
| | 3| 1.584962500721...| |
| | 4| 2.0| |
| | 5| 2.321928094887...| |
| | 6| 2.584962500721...| |
| | 7| 2.807354922057...| |
| | 8| 3.0| |
| | 9|3.1699250014423...| |
| +---+------------------+ |
| """ |
| return _invoke_function_over_columns("log2", col) |
| |
| |
| @_try_remote_functions |
| def conv(col: "ColumnOrName", fromBase: int, toBase: int) -> Column: |
| """ |
| Convert a number in a string column from one base to another. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| a column to convert base for. |
    fromBase : int
        the base to convert from.
    toBase : int
        the base to convert to.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        the value converted from `fromBase` to `toBase`.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("010101",), ( "101",), ("001",)], ['n']) |
| >>> df.select("*", sf.conv(df.n, 2, 16)).show() |
| +------+--------------+ |
| | n|conv(n, 2, 16)| |
| +------+--------------+ |
| |010101| 15| |
| | 101| 5| |
| | 001| 1| |
| +------+--------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "conv", _to_java_column(col), _enum_to_value(fromBase), _enum_to_value(toBase) |
| ) |
| |
| |
| @_try_remote_functions |
| def factorial(col: "ColumnOrName") -> Column: |
| """ |
| Computes the factorial of the given value. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| a column to calculate factorial for. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| factorial of given value. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(10).select("*", sf.factorial('id')).show() |
| +---+-------------+ |
| | id|factorial(id)| |
| +---+-------------+ |
| | 0| 1| |
| | 1| 1| |
| | 2| 2| |
| | 3| 6| |
| | 4| 24| |
| | 5| 120| |
| | 6| 720| |
| | 7| 5040| |
| | 8| 40320| |
| | 9| 362880| |
| +---+-------------+ |
| """ |
| return _invoke_function_over_columns("factorial", col) |
| |
| |
| # --------------- Window functions ------------------------ |
| |
| |
| @_try_remote_functions |
| def lag(col: "ColumnOrName", offset: int = 1, default: Optional[Any] = None) -> Column: |
| """ |
| Window function: returns the value that is `offset` rows before the current row, and |
    `default` if there are fewer than `offset` rows before the current row. For example,
| an `offset` of one will return the previous row at any given point in the window partition. |
| |
| This is equivalent to the LAG function in SQL. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| name of column or expression |
    offset : int, optional, default 1
        number of rows to look back from the current row.
    default : optional
        default value to use when the offset row does not exist.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| value before current row based on `offset`. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.lead` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.createDataFrame( |
| ... [("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"]) |
| >>> df.show() |
| +---+---+ |
| | c1| c2| |
| +---+---+ |
| | a| 1| |
| | a| 2| |
| | a| 3| |
| | b| 8| |
| | b| 2| |
| +---+---+ |
| |
| >>> w = Window.partitionBy("c1").orderBy("c2") |
| >>> df.withColumn("previous_value", sf.lag("c2").over(w)).show() |
| +---+---+--------------+ |
| | c1| c2|previous_value| |
| +---+---+--------------+ |
| | a| 1| NULL| |
| | a| 2| 1| |
| | a| 3| 2| |
| | b| 2| NULL| |
| | b| 8| 2| |
| +---+---+--------------+ |
| |
| >>> df.withColumn("previous_value", sf.lag("c2", 1, 0).over(w)).show() |
| +---+---+--------------+ |
| | c1| c2|previous_value| |
| +---+---+--------------+ |
| | a| 1| 0| |
| | a| 2| 1| |
| | a| 3| 2| |
| | b| 2| 0| |
| | b| 8| 2| |
| +---+---+--------------+ |
| |
| >>> df.withColumn("previous_value", sf.lag("c2", 2, -1).over(w)).show() |
| +---+---+--------------+ |
| | c1| c2|previous_value| |
| +---+---+--------------+ |
| | a| 1| -1| |
| | a| 2| -1| |
| | a| 3| 1| |
| | b| 2| -1| |
| | b| 8| -1| |
| +---+---+--------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "lag", _to_java_column(col), _enum_to_value(offset), _enum_to_value(default) |
| ) |
| |
| |
| @_try_remote_functions |
| def lead(col: "ColumnOrName", offset: int = 1, default: Optional[Any] = None) -> Column: |
| """ |
| Window function: returns the value that is `offset` rows after the current row, and |
    `default` if there are fewer than `offset` rows after the current row. For example,
| an `offset` of one will return the next row at any given point in the window partition. |
| |
| This is equivalent to the LEAD function in SQL. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| name of column or expression |
    offset : int, optional, default 1
        number of rows to look ahead from the current row.
    default : optional
        default value to use when the offset row does not exist.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| value after current row based on `offset`. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.lag` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.createDataFrame( |
| ... [("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"]) |
| >>> df.show() |
| +---+---+ |
| | c1| c2| |
| +---+---+ |
| | a| 1| |
| | a| 2| |
| | a| 3| |
| | b| 8| |
| | b| 2| |
| +---+---+ |
| |
| >>> w = Window.partitionBy("c1").orderBy("c2") |
| >>> df.withColumn("next_value", sf.lead("c2").over(w)).show() |
| +---+---+----------+ |
| | c1| c2|next_value| |
| +---+---+----------+ |
| | a| 1| 2| |
| | a| 2| 3| |
| | a| 3| NULL| |
| | b| 2| 8| |
| | b| 8| NULL| |
| +---+---+----------+ |
| |
| >>> df.withColumn("next_value", sf.lead("c2", 1, 0).over(w)).show() |
| +---+---+----------+ |
| | c1| c2|next_value| |
| +---+---+----------+ |
| | a| 1| 2| |
| | a| 2| 3| |
| | a| 3| 0| |
| | b| 2| 8| |
| | b| 8| 0| |
| +---+---+----------+ |
| |
| >>> df.withColumn("next_value", sf.lead("c2", 2, -1).over(w)).show() |
| +---+---+----------+ |
| | c1| c2|next_value| |
| +---+---+----------+ |
| | a| 1| 3| |
| | a| 2| -1| |
| | a| 3| -1| |
| | b| 2| -1| |
| | b| 8| -1| |
| +---+---+----------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "lead", _to_java_column(col), _enum_to_value(offset), _enum_to_value(default) |
| ) |
| |
| |
| @_try_remote_functions |
| def nth_value(col: "ColumnOrName", offset: int, ignoreNulls: Optional[bool] = False) -> Column: |
| """ |
| Window function: returns the value that is the `offset`\\th row of the window frame |
    (counting from 1), and `null` if the size of the window frame is less than `offset` rows.
| |
| It will return the `offset`\\th non-null value it sees when `ignoreNulls` is set to |
| true. If all values are null, then null is returned. |
| |
| This is equivalent to the nth_value function in SQL. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| name of column or expression |
| offset : int |
        the row number (counting from 1) whose value to return.
| ignoreNulls : bool, optional |
        indicates whether the Nth value should skip nulls when
        determining which row to use
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| value of nth row. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.createDataFrame( |
| ... [("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"]) |
| >>> df.show() |
| +---+---+ |
| | c1| c2| |
| +---+---+ |
| | a| 1| |
| | a| 2| |
| | a| 3| |
| | b| 8| |
| | b| 2| |
| +---+---+ |
| |
| >>> w = Window.partitionBy("c1").orderBy("c2") |
| >>> df.withColumn("nth_value", sf.nth_value("c2", 1).over(w)).show() |
| +---+---+---------+ |
| | c1| c2|nth_value| |
| +---+---+---------+ |
| | a| 1| 1| |
| | a| 2| 1| |
| | a| 3| 1| |
| | b| 2| 2| |
| | b| 8| 2| |
| +---+---+---------+ |
| |
| >>> df.withColumn("nth_value", sf.nth_value("c2", 2).over(w)).show() |
| +---+---+---------+ |
| | c1| c2|nth_value| |
| +---+---+---------+ |
| | a| 1| NULL| |
| | a| 2| 2| |
| | a| 3| 2| |
| | b| 2| NULL| |
| | b| 8| 8| |
| +---+---+---------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "nth_value", _to_java_column(col), _enum_to_value(offset), _enum_to_value(ignoreNulls) |
| ) |
| |
| |
| @_try_remote_functions |
| def any_value(col: "ColumnOrName", ignoreNulls: Optional[Union[bool, Column]] = None) -> Column: |
| """Returns some value of `col` for a group of rows. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| ignoreNulls : :class:`~pyspark.sql.Column` or bool, optional |
        if true, null values are ignored when picking a value.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| some value of `col` for a group of rows. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(None, 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"]) |
| >>> df.select(sf.any_value('c1'), sf.any_value('c2')).show() |
| +-------------+-------------+ |
| |any_value(c1)|any_value(c2)| |
| +-------------+-------------+ |
| | NULL| 1| |
| +-------------+-------------+ |
| |
| >>> df.select(sf.any_value('c1', True), sf.any_value('c2', True)).show() |
| +-------------+-------------+ |
| |any_value(c1)|any_value(c2)| |
| +-------------+-------------+ |
| | a| 1| |
| +-------------+-------------+ |
| """ |
| if ignoreNulls is None: |
| return _invoke_function_over_columns("any_value", col) |
| else: |
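        # Wrap a plain bool in a literal Column before the Column-based invocation.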
| ignoreNulls = _enum_to_value(ignoreNulls) |
| ignoreNulls = lit(ignoreNulls) if isinstance(ignoreNulls, bool) else ignoreNulls |
| return _invoke_function_over_columns( |
| "any_value", col, ignoreNulls # type: ignore[arg-type] |
| ) |
| |
| |
| @_try_remote_functions |
| def first_value(col: "ColumnOrName", ignoreNulls: Optional[Union[bool, Column]] = None) -> Column: |
| """Returns the first value of `col` for a group of rows. It will return the first non-null |
| value it sees when `ignoreNulls` is set to true. If all values are null, then null is returned. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| ignoreNulls : :class:`~pyspark.sql.Column` or bool, optional |
        if the first value is null then look for the first non-null value.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        the first value of `col` for a group of rows.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.last_value` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [(None, 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["a", "b"] |
| ... ).select(sf.first_value('a'), sf.first_value('b')).show() |
| +--------------+--------------+ |
| |first_value(a)|first_value(b)| |
| +--------------+--------------+ |
| | NULL| 1| |
| +--------------+--------------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [(None, 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["a", "b"] |
| ... ).select(sf.first_value('a', True), sf.first_value('b', True)).show() |
| +--------------+--------------+ |
| |first_value(a)|first_value(b)| |
| +--------------+--------------+ |
| | a| 1| |
| +--------------+--------------+ |
| """ |
| if ignoreNulls is None: |
| return _invoke_function_over_columns("first_value", col) |
| else: |
| ignoreNulls = _enum_to_value(ignoreNulls) |
| ignoreNulls = lit(ignoreNulls) if isinstance(ignoreNulls, bool) else ignoreNulls |
| return _invoke_function_over_columns( |
| "first_value", col, ignoreNulls # type: ignore[arg-type] |
| ) |
| |
| |
| @_try_remote_functions |
| def last_value(col: "ColumnOrName", ignoreNulls: Optional[Union[bool, Column]] = None) -> Column: |
| """Returns the last value of `col` for a group of rows. It will return the last non-null |
| value it sees when `ignoreNulls` is set to true. If all values are null, then null is returned. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| ignoreNulls : :class:`~pyspark.sql.Column` or bool, optional |
        if the last value is null then look for the last non-null value.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        the last value of `col` for a group of rows.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.first_value` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"] |
| ... ).select(sf.last_value('a'), sf.last_value('b')).show() |
| +-------------+-------------+ |
| |last_value(a)|last_value(b)| |
| +-------------+-------------+ |
| | NULL| 2| |
| +-------------+-------------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"] |
| ... ).select(sf.last_value('a', True), sf.last_value('b', True)).show() |
| +-------------+-------------+ |
| |last_value(a)|last_value(b)| |
| +-------------+-------------+ |
| | b| 2| |
| +-------------+-------------+ |
| """ |
| if ignoreNulls is None: |
| return _invoke_function_over_columns("last_value", col) |
| else: |
| ignoreNulls = _enum_to_value(ignoreNulls) |
| ignoreNulls = lit(ignoreNulls) if isinstance(ignoreNulls, bool) else ignoreNulls |
| return _invoke_function_over_columns( |
| "last_value", col, ignoreNulls # type: ignore[arg-type] |
| ) |
| |
| |
| @_try_remote_functions |
| def count_if(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: Returns the number of `TRUE` values for the `col`. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the number of `TRUE` values for the `col`. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.count` |
| |
| Examples |
| -------- |
| Example 1: Counting the number of even numbers in a numeric column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"]) |
| >>> df.select(sf.count_if(sf.col('c2') % 2 == 0)).show() |
| +------------------------+ |
| |count_if(((c2 % 2) = 0))| |
| +------------------------+ |
| | 3| |
| +------------------------+ |
| |
| Example 2: Counting the number of rows where a string column starts with a certain letter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("apple",), ("banana",), ("cherry",), ("apple",), ("banana",)], ["fruit"]) |
| >>> df.select(sf.count_if(sf.col('fruit').startswith('a'))).show() |
| +------------------------------+ |
| |count_if(startswith(fruit, a))| |
| +------------------------------+ |
| | 2| |
| +------------------------------+ |
| |
| Example 3: Counting the number of rows where a numeric column is greater than a certain value |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ["num"]) |
| >>> df.select(sf.count_if(sf.col('num') > 3)).show() |
| +-------------------+ |
| |count_if((num > 3))| |
| +-------------------+ |
| | 2| |
| +-------------------+ |
| |
| Example 4: Counting the number of rows where a boolean column is True |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(True,), (False,), (True,), (False,), (True,)], ["b"]) |
| >>> df.select(sf.count('b'), sf.count_if('b')).show() |
| +--------+-----------+ |
| |count(b)|count_if(b)| |
| +--------+-----------+ |
| | 5| 3| |
| +--------+-----------+ |
| """ |
| return _invoke_function_over_columns("count_if", col) |
| |
| |
| @_try_remote_functions |
| def histogram_numeric(col: "ColumnOrName", nBins: Column) -> Column: |
| """Computes a histogram on numeric 'col' using nb bins. |
| The return value is an array of (x,y) pairs representing the centers of the |
| histogram's bins. As the value of 'nb' is increased, the histogram approximation |
| gets finer-grained, but may yield artifacts around outliers. In practice, 20-40 |
| histogram bins appear to work well, with more bins being required for skewed or |
| smaller datasets. Note that this function creates a histogram with non-uniform |
| bin widths. It offers no guarantees in terms of the mean-squared-error of the |
| histogram, but in practice is comparable to the histograms produced by the R/S-Plus |
| statistical computing packages. Note: the output type of the 'x' field in the return value is |
| propagated from the input value consumed in the aggregate function. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| nBins : :class:`~pyspark.sql.Column` |
        number of histogram bins.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        a histogram on numeric 'col' using nBins bins.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(100, numPartitions=1) |
| >>> df.select(sf.histogram_numeric('id', sf.lit(5))).show(truncate=False) |
| +-----------------------------------------------------------+ |
| |histogram_numeric(id, 5) | |
| +-----------------------------------------------------------+ |
| |[{11, 25.0}, {36, 24.0}, {59, 23.0}, {84, 25.0}, {98, 3.0}]| |
| +-----------------------------------------------------------+ |
| """ |
| return _invoke_function_over_columns("histogram_numeric", col, nBins) |
| |
| |
| @_try_remote_functions |
| def ntile(n: int) -> Column: |
| """ |
| Window function: returns the ntile group id (from 1 to `n` inclusive) |
| in an ordered window partition. For example, if `n` is 4, the first |
| quarter of the rows will get value 1, the second quarter will get 2, |
| the third quarter will get 3, and the last quarter will get 4. |
| |
| This is equivalent to the NTILE function in SQL. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| n : int |
| an integer |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        ntile group id.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql import Window |
| >>> df = spark.createDataFrame( |
| ... [("a", 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["c1", "c2"]) |
| >>> df.show() |
| +---+---+ |
| | c1| c2| |
| +---+---+ |
| | a| 1| |
| | a| 2| |
| | a| 3| |
| | b| 8| |
| | b| 2| |
| +---+---+ |
| |
| >>> w = Window.partitionBy("c1").orderBy("c2") |
| >>> df.withColumn("ntile", sf.ntile(2).over(w)).show() |
| +---+---+-----+ |
| | c1| c2|ntile| |
| +---+---+-----+ |
| | a| 1| 1| |
| | a| 2| 1| |
| | a| 3| 2| |
| | b| 2| 1| |
| | b| 8| 2| |
| +---+---+-----+ |
| """ |
| return _invoke_function("ntile", int(_enum_to_value(n))) |
| |
| |
| # ---------------------- Date/Timestamp functions ------------------------------ |
| |
| |
| @_try_remote_functions |
| def curdate() -> Column: |
| """ |
| Returns the current date at the start of query evaluation as a :class:`DateType` column. |
| All calls of current_date within the same query return the same value. |
| |
| .. versionadded:: 3.5.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| current date. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.now` |
| :meth:`pyspark.sql.functions.current_date` |
| :meth:`pyspark.sql.functions.current_timestamp` |
| :meth:`pyspark.sql.functions.localtimestamp` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.curdate()).show() # doctest: +SKIP |
| +--------------+ |
| |current_date()| |
| +--------------+ |
| | 2022-08-26| |
| +--------------+ |
| """ |
| return _invoke_function("curdate") |
| |
| |
| @_try_remote_functions |
| def current_date() -> Column: |
| """ |
| Returns the current date at the start of query evaluation as a :class:`DateType` column. |
| All calls of current_date within the same query return the same value. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| current date. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.now` |
| :meth:`pyspark.sql.functions.curdate` |
| :meth:`pyspark.sql.functions.current_timestamp` |
| :meth:`pyspark.sql.functions.localtimestamp` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.current_date()).show() # doctest: +SKIP |
| +--------------+ |
| |current_date()| |
| +--------------+ |
| | 2022-08-26| |
| +--------------+ |
| """ |
| return _invoke_function("current_date") |
| |
| |
| @_try_remote_functions |
| def current_timezone() -> Column: |
| """ |
| Returns the current session local timezone. |
| |
| .. versionadded:: 3.5.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| current session local timezone. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.convert_timezone` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.current_timezone()).show() |
| +-------------------+ |
| | current_timezone()| |
| +-------------------+ |
| |America/Los_Angeles| |
| +-------------------+ |
| |
| Switch the timezone to Shanghai. |
| |
| >>> spark.conf.set("spark.sql.session.timeZone", "Asia/Shanghai") |
| >>> spark.range(1).select(sf.current_timezone()).show() |
| +------------------+ |
| |current_timezone()| |
| +------------------+ |
| | Asia/Shanghai| |
| +------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| return _invoke_function("current_timezone") |
| |
| |
| @overload |
| def current_time() -> Column: |
| ... |
| |
| |
| @overload |
| def current_time(precision: int) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def current_time(precision: Optional[int] = None) -> Column: |
| """ |
| Returns the current time at the start of query evaluation as a :class:`TimeType` column. All |
| calls of current_time within the same query return the same value. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
    precision : literal int, optional
| number in the range [0..6], indicating how many fractional digits of seconds to include. |
| If omitted, the default is 6. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| current time. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.current_date` |
| :meth:`pyspark.sql.functions.current_timestamp` |
| |
| Examples |
| -------- |
| Example 1: Current time with default precision |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.current_time().alias("time")).show() # doctest: +SKIP |
| +---------------+ |
| | time| |
| +---------------+ |
| |16:57:04.304361| |
| +---------------+ |
| |
| Example 2: Current time with specified precision |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.current_time(3).alias("time")).show() # doctest: +SKIP |
| +------------+ |
| | time| |
| +------------+ |
| |16:57:04.304| |
| +------------+ |
| """ |
| if precision is None: |
| return _invoke_function("current_time") |
| else: |
| return _invoke_function("current_time", _enum_to_value(precision)) |
| |
| |
| @_try_remote_functions |
| def current_timestamp() -> Column: |
| """ |
| Returns the current timestamp at the start of query evaluation as a :class:`TimestampType` |
| column. All calls of current_timestamp within the same query return the same value. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| current date and time. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.now` |
| :meth:`pyspark.sql.functions.curdate` |
| :meth:`pyspark.sql.functions.current_date` |
| :meth:`pyspark.sql.functions.localtimestamp` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.current_timestamp()).show(truncate=False) # doctest: +SKIP |
| +-----------------------+ |
| |current_timestamp() | |
| +-----------------------+ |
| |2022-08-26 21:23:22.716| |
| +-----------------------+ |
| """ |
| return _invoke_function("current_timestamp") |
| |
| |
| @_try_remote_functions |
| def now() -> Column: |
| """ |
| Returns the current timestamp at the start of query evaluation. |
| |
| .. versionadded:: 3.5.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| current timestamp at the start of query evaluation. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.curdate` |
| :meth:`pyspark.sql.functions.current_date` |
| :meth:`pyspark.sql.functions.current_timestamp` |
| :meth:`pyspark.sql.functions.localtimestamp` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.now()).show(truncate=False) # doctest: +SKIP |
| +--------------------------+ |
| |now() | |
| +--------------------------+ |
| |2023-12-08 15:18:18.482269| |
| +--------------------------+ |
| """ |
| return _invoke_function("now") |
| |
| |
| @_try_remote_functions |
| def localtimestamp() -> Column: |
| """ |
| Returns the current timestamp without time zone at the start of query evaluation |
| as a timestamp without time zone column. All calls of localtimestamp within the |
| same query return the same value. |
| |
| .. versionadded:: 3.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| current local date and time. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.now` |
| :meth:`pyspark.sql.functions.curdate` |
| :meth:`pyspark.sql.functions.current_date` |
| :meth:`pyspark.sql.functions.current_timestamp` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.localtimestamp()).show(truncate=False) # doctest: +SKIP |
| +-----------------------+ |
| |localtimestamp() | |
| +-----------------------+ |
| |2022-08-26 21:28:34.639| |
| +-----------------------+ |
| """ |
| return _invoke_function("localtimestamp") |
| |
| |
| @_try_remote_functions |
| def date_format(date: "ColumnOrName", format: str) -> Column: |
| """ |
| Converts a date/timestamp/string to a value of string in the format specified by the date |
| format given by the second argument. |
| |
| A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All |
    pattern letters of `datetime pattern`_ can be used.
| |
| .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
| Whenever possible, use specialized functions like `year`. |
| |
| Parameters |
| ---------- |
| date : :class:`~pyspark.sql.Column` or column name |
| input column of values to format. |
    format : literal string
| format to use to represent datetime values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_date` |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp_ltz` |
| :meth:`pyspark.sql.functions.to_timestamp_ntz` |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| :meth:`pyspark.sql.functions.try_to_timestamp` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| string value representing formatted datetime. |
| |
| Examples |
| -------- |
| Example 1: Format a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.date_format('dt', 'MM/dd/yyyy')).show() |
| +----------+----------+---------------------------+ |
| | dt|typeof(dt)|date_format(dt, MM/dd/yyyy)| |
| +----------+----------+---------------------------+ |
| |2015-04-08| string| 04/08/2015| |
| |2024-10-31| string| 10/31/2024| |
| +----------+----------+---------------------------+ |
| |
| Example 2: Format a string column representing timestamp |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.date_format('ts', 'yy=MM=dd HH=mm=ss')).show() |
| +-------------------+----------+----------------------------------+ |
| | ts|typeof(ts)|date_format(ts, yy=MM=dd HH=mm=ss)| |
| +-------------------+----------+----------------------------------+ |
| |2015-04-08 13:08:15| string| 15=04=08 13=08=15| |
| |2024-10-31 10:09:16| string| 24=10=31 10=09=16| |
| +-------------------+----------+----------------------------------+ |
| |
| Example 3: Format a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.date_format('dt', 'yy--MM--dd')).show() |
| +----------+----------+---------------------------+ |
| | dt|typeof(dt)|date_format(dt, yy--MM--dd)| |
| +----------+----------+---------------------------+ |
| |2015-04-08| date| 15--04--08| |
| |2024-10-31| date| 24--10--31| |
| +----------+----------+---------------------------+ |
| |
| Example 4: Format a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.date_format('ts', 'yy=MM=dd HH=mm=ss')).show() |
| +-------------------+----------+----------------------------------+ |
| | ts|typeof(ts)|date_format(ts, yy=MM=dd HH=mm=ss)| |
| +-------------------+----------+----------------------------------+ |
| |2015-04-08 13:08:15| timestamp| 15=04=08 13=08=15| |
| |2024-10-31 10:09:16| timestamp| 24=10=31 10=09=16| |
| +-------------------+----------+----------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("date_format", _to_java_column(date), _enum_to_value(format)) |
| |
| |
| @_try_remote_functions |
| def year(col: "ColumnOrName") -> Column: |
| """ |
| Extract the year of a given date/timestamp as integer. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| year part of the date/timestamp as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.quarter` |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.hour` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.second` |
| :meth:`pyspark.sql.functions.extract` |
| :meth:`pyspark.sql.functions.datepart` |
| :meth:`pyspark.sql.functions.date_part` |
| |
| Examples |
| -------- |
| Example 1: Extract the year from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.year('dt')).show() |
| +----------+----------+--------+ |
| | dt|typeof(dt)|year(dt)| |
| +----------+----------+--------+ |
| |2015-04-08| string| 2015| |
| |2024-10-31| string| 2024| |
| +----------+----------+--------+ |
| |
| Example 2: Extract the year from a string column representing timestamp |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.year('ts')).show() |
| +-------------------+----------+--------+ |
| | ts|typeof(ts)|year(ts)| |
| +-------------------+----------+--------+ |
| |2015-04-08 13:08:15| string| 2015| |
| |2024-10-31 10:09:16| string| 2024| |
| +-------------------+----------+--------+ |
| |
| Example 3: Extract the year from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.year('dt')).show() |
| +----------+----------+--------+ |
| | dt|typeof(dt)|year(dt)| |
| +----------+----------+--------+ |
| |2015-04-08| date| 2015| |
| |2024-10-31| date| 2024| |
| +----------+----------+--------+ |
| |
| Example 4: Extract the year from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.year('ts')).show() |
| +-------------------+----------+--------+ |
| | ts|typeof(ts)|year(ts)| |
| +-------------------+----------+--------+ |
| |2015-04-08 13:08:15| timestamp| 2015| |
| |2024-10-31 10:09:16| timestamp| 2024| |
| +-------------------+----------+--------+ |
| """ |
| return _invoke_function_over_columns("year", col) |
| |
| |
| @_try_remote_functions |
| def quarter(col: "ColumnOrName") -> Column: |
| """ |
| Extract the quarter of a given date/timestamp as integer. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| quarter of the date/timestamp as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.year` |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.hour` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.second` |
| :meth:`pyspark.sql.functions.extract` |
| :meth:`pyspark.sql.functions.datepart` |
| :meth:`pyspark.sql.functions.date_part` |
| |
| Examples |
| -------- |
| Example 1: Extract the quarter from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.quarter('dt')).show() |
| +----------+----------+-----------+ |
| | dt|typeof(dt)|quarter(dt)| |
| +----------+----------+-----------+ |
| |2015-04-08| string| 2| |
| |2024-10-31| string| 4| |
| +----------+----------+-----------+ |
| |
| Example 2: Extract the quarter from a string column representing timestamp |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.quarter('ts')).show() |
| +-------------------+----------+-----------+ |
| | ts|typeof(ts)|quarter(ts)| |
| +-------------------+----------+-----------+ |
| |2015-04-08 13:08:15| string| 2| |
| |2024-10-31 10:09:16| string| 4| |
| +-------------------+----------+-----------+ |
| |
| Example 3: Extract the quarter from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.quarter('dt')).show() |
| +----------+----------+-----------+ |
| | dt|typeof(dt)|quarter(dt)| |
| +----------+----------+-----------+ |
| |2015-04-08| date| 2| |
| |2024-10-31| date| 4| |
| +----------+----------+-----------+ |
| |
| Example 4: Extract the quarter from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.quarter('ts')).show() |
| +-------------------+----------+-----------+ |
| | ts|typeof(ts)|quarter(ts)| |
| +-------------------+----------+-----------+ |
| |2015-04-08 13:08:15| timestamp| 2| |
| |2024-10-31 10:09:16| timestamp| 4| |
| +-------------------+----------+-----------+ |
| """ |
| return _invoke_function_over_columns("quarter", col) |
| |
| |
| @_try_remote_functions |
| def month(col: "ColumnOrName") -> Column: |
| """ |
| Extract the month of a given date/timestamp as integer. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| month part of the date/timestamp as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.year` |
| :meth:`pyspark.sql.functions.quarter` |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.hour` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.second` |
| :meth:`pyspark.sql.functions.monthname` |
| :meth:`pyspark.sql.functions.extract` |
| :meth:`pyspark.sql.functions.datepart` |
| :meth:`pyspark.sql.functions.date_part` |
| |
| Examples |
| -------- |
| Example 1: Extract the month from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.month('dt')).show() |
| +----------+----------+---------+ |
| | dt|typeof(dt)|month(dt)| |
| +----------+----------+---------+ |
| |2015-04-08| string| 4| |
| |2024-10-31| string| 10| |
| +----------+----------+---------+ |
| |
Example 2: Extract the month from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.month('ts')).show() |
| +-------------------+----------+---------+ |
| | ts|typeof(ts)|month(ts)| |
| +-------------------+----------+---------+ |
| |2015-04-08 13:08:15| string| 4| |
| |2024-10-31 10:09:16| string| 10| |
| +-------------------+----------+---------+ |
| |
| Example 3: Extract the month from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.month('dt')).show() |
| +----------+----------+---------+ |
| | dt|typeof(dt)|month(dt)| |
| +----------+----------+---------+ |
| |2015-04-08| date| 4| |
| |2024-10-31| date| 10| |
| +----------+----------+---------+ |
| |
Example 4: Extract the month from a timestamp column
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.month('ts')).show() |
| +-------------------+----------+---------+ |
| | ts|typeof(ts)|month(ts)| |
| +-------------------+----------+---------+ |
| |2015-04-08 13:08:15| timestamp| 4| |
| |2024-10-31 10:09:16| timestamp| 10| |
| +-------------------+----------+---------+ |
| """ |
| return _invoke_function_over_columns("month", col) |
| |
| |
| @_try_remote_functions |
| def dayofweek(col: "ColumnOrName") -> Column: |
| """ |
| Extract the day of the week of a given date/timestamp as integer. |
Ranges from 1 for a Sunday through to 7 for a Saturday.
| |
| .. versionadded:: 2.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| day of the week for given date/timestamp as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.dayofyear` |
| :meth:`pyspark.sql.functions.dayofmonth` |
| :meth:`pyspark.sql.functions.weekofyear` |
| |
| Examples |
| -------- |
| Example 1: Extract the day of the week from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.dayofweek('dt')).show() |
| +----------+----------+-------------+ |
| | dt|typeof(dt)|dayofweek(dt)| |
| +----------+----------+-------------+ |
| |2015-04-08| string| 4| |
| |2024-10-31| string| 5| |
| +----------+----------+-------------+ |
| |
Example 2: Extract the day of the week from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.dayofweek('ts')).show() |
| +-------------------+----------+-------------+ |
| | ts|typeof(ts)|dayofweek(ts)| |
| +-------------------+----------+-------------+ |
| |2015-04-08 13:08:15| string| 4| |
| |2024-10-31 10:09:16| string| 5| |
| +-------------------+----------+-------------+ |
| |
| Example 3: Extract the day of the week from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.dayofweek('dt')).show() |
| +----------+----------+-------------+ |
| | dt|typeof(dt)|dayofweek(dt)| |
| +----------+----------+-------------+ |
| |2015-04-08| date| 4| |
| |2024-10-31| date| 5| |
| +----------+----------+-------------+ |
| |
| Example 4: Extract the day of the week from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.dayofweek('ts')).show() |
| +-------------------+----------+-------------+ |
| | ts|typeof(ts)|dayofweek(ts)| |
| +-------------------+----------+-------------+ |
| |2015-04-08 13:08:15| timestamp| 4| |
| |2024-10-31 10:09:16| timestamp| 5| |
| +-------------------+----------+-------------+ |
| """ |
| return _invoke_function_over_columns("dayofweek", col) |
| |
| |
| @_try_remote_functions |
| def dayofmonth(col: "ColumnOrName") -> Column: |
| """ |
| Extract the day of the month of a given date/timestamp as integer. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
Returns
-------
:class:`~pyspark.sql.Column`
day of the month for given date/timestamp as integer.

See Also
--------
:meth:`pyspark.sql.functions.day`
:meth:`pyspark.sql.functions.dayofyear`
:meth:`pyspark.sql.functions.dayofweek`
:meth:`pyspark.sql.functions.weekofyear`
| |
| Examples |
| -------- |
| Example 1: Extract the day of the month from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.dayofmonth('dt')).show() |
| +----------+----------+--------------+ |
| | dt|typeof(dt)|dayofmonth(dt)| |
| +----------+----------+--------------+ |
| |2015-04-08| string| 8| |
| |2024-10-31| string| 31| |
| +----------+----------+--------------+ |
| |
Example 2: Extract the day of the month from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.dayofmonth('ts')).show() |
| +-------------------+----------+--------------+ |
| | ts|typeof(ts)|dayofmonth(ts)| |
| +-------------------+----------+--------------+ |
| |2015-04-08 13:08:15| string| 8| |
| |2024-10-31 10:09:16| string| 31| |
| +-------------------+----------+--------------+ |
| |
| Example 3: Extract the day of the month from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.dayofmonth('dt')).show() |
| +----------+----------+--------------+ |
| | dt|typeof(dt)|dayofmonth(dt)| |
| +----------+----------+--------------+ |
| |2015-04-08| date| 8| |
| |2024-10-31| date| 31| |
| +----------+----------+--------------+ |
| |
| Example 4: Extract the day of the month from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.dayofmonth('ts')).show() |
| +-------------------+----------+--------------+ |
| | ts|typeof(ts)|dayofmonth(ts)| |
| +-------------------+----------+--------------+ |
| |2015-04-08 13:08:15| timestamp| 8| |
| |2024-10-31 10:09:16| timestamp| 31| |
| +-------------------+----------+--------------+ |
| """ |
| return _invoke_function_over_columns("dayofmonth", col) |
| |
| |
| @_try_remote_functions |
| def day(col: "ColumnOrName") -> Column: |
| """ |
| Extract the day of the month of a given date/timestamp as integer. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| day of the month for given date/timestamp as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.year` |
| :meth:`pyspark.sql.functions.quarter` |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.hour` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.second` |
| :meth:`pyspark.sql.functions.dayname` |
| :meth:`pyspark.sql.functions.dayofyear` |
| :meth:`pyspark.sql.functions.dayofmonth` |
| :meth:`pyspark.sql.functions.dayofweek` |
| :meth:`pyspark.sql.functions.extract` |
| :meth:`pyspark.sql.functions.datepart` |
| :meth:`pyspark.sql.functions.date_part` |
| |
| Examples |
| -------- |
| Example 1: Extract the day of the month from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.day('dt')).show() |
| +----------+----------+-------+ |
| | dt|typeof(dt)|day(dt)| |
| +----------+----------+-------+ |
| |2015-04-08| string| 8| |
| |2024-10-31| string| 31| |
| +----------+----------+-------+ |
| |
Example 2: Extract the day of the month from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.day('ts')).show() |
| +-------------------+----------+-------+ |
| | ts|typeof(ts)|day(ts)| |
| +-------------------+----------+-------+ |
| |2015-04-08 13:08:15| string| 8| |
| |2024-10-31 10:09:16| string| 31| |
| +-------------------+----------+-------+ |
| |
| Example 3: Extract the day of the month from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.day('dt')).show() |
| +----------+----------+-------+ |
| | dt|typeof(dt)|day(dt)| |
| +----------+----------+-------+ |
| |2015-04-08| date| 8| |
| |2024-10-31| date| 31| |
| +----------+----------+-------+ |
| |
| Example 4: Extract the day of the month from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.day('ts')).show() |
| +-------------------+----------+-------+ |
| | ts|typeof(ts)|day(ts)| |
| +-------------------+----------+-------+ |
| |2015-04-08 13:08:15| timestamp| 8| |
| |2024-10-31 10:09:16| timestamp| 31| |
| +-------------------+----------+-------+ |
| """ |
| return _invoke_function_over_columns("day", col) |
| |
| |
| @_try_remote_functions |
| def dayofyear(col: "ColumnOrName") -> Column: |
| """ |
| Extract the day of the year of a given date/timestamp as integer. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| day of the year for given date/timestamp as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.day` |
:meth:`pyspark.sql.functions.dayofweek`
| :meth:`pyspark.sql.functions.dayofmonth` |
| :meth:`pyspark.sql.functions.weekofyear` |
| |
| Examples |
| -------- |
| Example 1: Extract the day of the year from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.dayofyear('dt')).show() |
| +----------+----------+-------------+ |
| | dt|typeof(dt)|dayofyear(dt)| |
| +----------+----------+-------------+ |
| |2015-04-08| string| 98| |
| |2024-10-31| string| 305| |
| +----------+----------+-------------+ |
| |
Example 2: Extract the day of the year from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.dayofyear('ts')).show() |
| +-------------------+----------+-------------+ |
| | ts|typeof(ts)|dayofyear(ts)| |
| +-------------------+----------+-------------+ |
| |2015-04-08 13:08:15| string| 98| |
| |2024-10-31 10:09:16| string| 305| |
| +-------------------+----------+-------------+ |
| |
| Example 3: Extract the day of the year from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.dayofyear('dt')).show() |
| +----------+----------+-------------+ |
| | dt|typeof(dt)|dayofyear(dt)| |
| +----------+----------+-------------+ |
| |2015-04-08| date| 98| |
| |2024-10-31| date| 305| |
| +----------+----------+-------------+ |
| |
| Example 4: Extract the day of the year from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.dayofyear('ts')).show() |
| +-------------------+----------+-------------+ |
| | ts|typeof(ts)|dayofyear(ts)| |
| +-------------------+----------+-------------+ |
| |2015-04-08 13:08:15| timestamp| 98| |
| |2024-10-31 10:09:16| timestamp| 305| |
| +-------------------+----------+-------------+ |
| """ |
| return _invoke_function_over_columns("dayofyear", col) |
| |
| |
| @_try_remote_functions |
| def hour(col: "ColumnOrName") -> Column: |
| """ |
| Extract the hours of a given timestamp as integer. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. versionchanged:: 4.1.0 |
| Added support for time type. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/time/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| hour part of the timestamp as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.year` |
| :meth:`pyspark.sql.functions.quarter` |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.second` |
| :meth:`pyspark.sql.functions.extract` |
| :meth:`pyspark.sql.functions.datepart` |
| :meth:`pyspark.sql.functions.date_part` |
| |
| Examples |
| -------- |
Example 1: Extract the hours from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.hour('ts')).show() |
| +-------------------+----------+--------+ |
| | ts|typeof(ts)|hour(ts)| |
| +-------------------+----------+--------+ |
| |2015-04-08 13:08:15| string| 13| |
| |2024-10-31 10:09:16| string| 10| |
| +-------------------+----------+--------+ |
| |
| Example 2: Extract the hours from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.hour('ts')).show() |
| +-------------------+----------+--------+ |
| | ts|typeof(ts)|hour(ts)| |
| +-------------------+----------+--------+ |
| |2015-04-08 13:08:15| timestamp| 13| |
| |2024-10-31 10:09:16| timestamp| 10| |
| +-------------------+----------+--------+ |
| |
| Example 3: Extract the hours from a time column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("13:08:15",), |
| ... ("10:09:16",)], ['t']).withColumn("t", sf.col("t").cast("time")) |
| >>> df.select("*", sf.typeof('t'), sf.hour('t')).show() |
| +--------+---------+-------+ |
| | t|typeof(t)|hour(t)| |
| +--------+---------+-------+ |
| |13:08:15| time(6)| 13| |
| |10:09:16| time(6)| 10| |
| +--------+---------+-------+ |
| """ |
| return _invoke_function_over_columns("hour", col) |
| |
| |
| @_try_remote_functions |
| def minute(col: "ColumnOrName") -> Column: |
| """ |
| Extract the minutes of a given timestamp as integer. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. versionchanged:: 4.1.0 |
| Added support for time type. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/time/timestamp column to work on. |
| |
Returns
-------
:class:`~pyspark.sql.Column`
minutes part of the timestamp as integer.

See Also
--------
:meth:`pyspark.sql.functions.year`
:meth:`pyspark.sql.functions.quarter`
:meth:`pyspark.sql.functions.month`
:meth:`pyspark.sql.functions.day`
:meth:`pyspark.sql.functions.hour`
:meth:`pyspark.sql.functions.second`
:meth:`pyspark.sql.functions.extract`
:meth:`pyspark.sql.functions.datepart`
:meth:`pyspark.sql.functions.date_part`
| |
| Examples |
| -------- |
Example 1: Extract the minutes from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.minute('ts')).show() |
| +-------------------+----------+----------+ |
| | ts|typeof(ts)|minute(ts)| |
| +-------------------+----------+----------+ |
| |2015-04-08 13:08:15| string| 8| |
| |2024-10-31 10:09:16| string| 9| |
| +-------------------+----------+----------+ |
| |
| Example 2: Extract the minutes from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.minute('ts')).show() |
| +-------------------+----------+----------+ |
| | ts|typeof(ts)|minute(ts)| |
| +-------------------+----------+----------+ |
| |2015-04-08 13:08:15| timestamp| 8| |
| |2024-10-31 10:09:16| timestamp| 9| |
| +-------------------+----------+----------+ |
| |
| Example 3: Extract the minutes from a time column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("13:08:15",), |
| ... ("10:09:16",)], ['t']).withColumn("t", sf.col("t").cast("time")) |
| >>> df.select("*", sf.typeof('t'), sf.minute('t')).show() |
| +--------+---------+---------+ |
| | t|typeof(t)|minute(t)| |
| +--------+---------+---------+ |
| |13:08:15| time(6)| 8| |
| |10:09:16| time(6)| 9| |
| +--------+---------+---------+ |
| """ |
| return _invoke_function_over_columns("minute", col) |
| |
| |
| @_try_remote_functions |
| def second(col: "ColumnOrName") -> Column: |
| """ |
Extract the seconds of a given timestamp as integer.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. versionchanged:: 4.1.0 |
| Added support for time type. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/time/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| `seconds` part of the timestamp as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.year` |
| :meth:`pyspark.sql.functions.quarter` |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.hour` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.extract` |
| :meth:`pyspark.sql.functions.datepart` |
| :meth:`pyspark.sql.functions.date_part` |
| |
| Examples |
| -------- |
Example 1: Extract the seconds from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.second('ts')).show() |
| +-------------------+----------+----------+ |
| | ts|typeof(ts)|second(ts)| |
| +-------------------+----------+----------+ |
| |2015-04-08 13:08:15| string| 15| |
| |2024-10-31 10:09:16| string| 16| |
| +-------------------+----------+----------+ |
| |
| Example 2: Extract the seconds from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.second('ts')).show() |
| +-------------------+----------+----------+ |
| | ts|typeof(ts)|second(ts)| |
| +-------------------+----------+----------+ |
| |2015-04-08 13:08:15| timestamp| 15| |
| |2024-10-31 10:09:16| timestamp| 16| |
| +-------------------+----------+----------+ |
| |
| Example 3: Extract the seconds from a time column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ("13:08:15",), |
| ... ("10:09:16",)], ['t']).withColumn("t", sf.col("t").cast("time")) |
| >>> df.select("*", sf.typeof('t'), sf.second('t')).show() |
| +--------+---------+---------+ |
| | t|typeof(t)|second(t)| |
| +--------+---------+---------+ |
| |13:08:15| time(6)| 15| |
| |10:09:16| time(6)| 16| |
| +--------+---------+---------+ |
| """ |
| return _invoke_function_over_columns("second", col) |
| |
| |
| @_try_remote_functions |
| def weekofyear(col: "ColumnOrName") -> Column: |
| """ |
| Extract the week number of a given date as integer. |
A week is considered to start on a Monday, and week 1 is the first week with more
than 3 days, as defined by ISO 8601.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
target date/timestamp column to work on.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| `week` of the year for given date as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.weekday` |
| :meth:`pyspark.sql.functions.dayofweek` |
| :meth:`pyspark.sql.functions.dayofmonth` |
| :meth:`pyspark.sql.functions.dayofyear` |
| |
| Examples |
| -------- |
| Example 1: Extract the week of the year from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.weekofyear('dt')).show() |
| +----------+----------+--------------+ |
| | dt|typeof(dt)|weekofyear(dt)| |
| +----------+----------+--------------+ |
| |2015-04-08| string| 15| |
| |2024-10-31| string| 44| |
| +----------+----------+--------------+ |
| |
Example 2: Extract the week of the year from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.weekofyear('ts')).show() |
| +-------------------+----------+--------------+ |
| | ts|typeof(ts)|weekofyear(ts)| |
| +-------------------+----------+--------------+ |
| |2015-04-08 13:08:15| string| 15| |
| |2024-10-31 10:09:16| string| 44| |
| +-------------------+----------+--------------+ |
| |
| Example 3: Extract the week of the year from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.weekofyear('dt')).show() |
| +----------+----------+--------------+ |
| | dt|typeof(dt)|weekofyear(dt)| |
| +----------+----------+--------------+ |
| |2015-04-08| date| 15| |
| |2024-10-31| date| 44| |
| +----------+----------+--------------+ |
| |
| Example 4: Extract the week of the year from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.weekofyear('ts')).show() |
| +-------------------+----------+--------------+ |
| | ts|typeof(ts)|weekofyear(ts)| |
| +-------------------+----------+--------------+ |
| |2015-04-08 13:08:15| timestamp| 15| |
| |2024-10-31 10:09:16| timestamp| 44| |
| +-------------------+----------+--------------+ |
| """ |
| return _invoke_function_over_columns("weekofyear", col) |
| |
| |
| @_try_remote_functions |
| def weekday(col: "ColumnOrName") -> Column: |
| """ |
| Returns the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday). |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday). |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.weekofyear` |
| |
| Examples |
| -------- |
| Example 1: Extract the day of the week from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.weekday('dt')).show() |
| +----------+----------+-----------+ |
| | dt|typeof(dt)|weekday(dt)| |
| +----------+----------+-----------+ |
| |2015-04-08| string| 2| |
| |2024-10-31| string| 3| |
| +----------+----------+-----------+ |
| |
Example 2: Extract the day of the week from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.weekday('ts')).show() |
| +-------------------+----------+-----------+ |
| | ts|typeof(ts)|weekday(ts)| |
| +-------------------+----------+-----------+ |
| |2015-04-08 13:08:15| string| 2| |
| |2024-10-31 10:09:16| string| 3| |
| +-------------------+----------+-----------+ |
| |
| Example 3: Extract the day of the week from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.weekday('dt')).show() |
| +----------+----------+-----------+ |
| | dt|typeof(dt)|weekday(dt)| |
| +----------+----------+-----------+ |
| |2015-04-08| date| 2| |
| |2024-10-31| date| 3| |
| +----------+----------+-----------+ |
| |
| Example 4: Extract the day of the week from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.weekday('ts')).show() |
| +-------------------+----------+-----------+ |
| | ts|typeof(ts)|weekday(ts)| |
| +-------------------+----------+-----------+ |
| |2015-04-08 13:08:15| timestamp| 2| |
| |2024-10-31 10:09:16| timestamp| 3| |
| +-------------------+----------+-----------+ |
| """ |
| return _invoke_function_over_columns("weekday", col) |
| |
| |
| @_try_remote_functions |
| def monthname(col: "ColumnOrName") -> Column: |
| """ |
Returns the three-letter abbreviated month name from the given date/timestamp.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the three-letter abbreviation of month name for date/timestamp (Jan, Feb, Mar...) |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.dayname` |
| |
| Examples |
| -------- |
| Example 1: Extract the month name from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.monthname('dt')).show() |
| +----------+----------+-------------+ |
| | dt|typeof(dt)|monthname(dt)| |
| +----------+----------+-------------+ |
| |2015-04-08| string| Apr| |
| |2024-10-31| string| Oct| |
| +----------+----------+-------------+ |
| |
Example 2: Extract the month name from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.monthname('ts')).show() |
| +-------------------+----------+-------------+ |
| | ts|typeof(ts)|monthname(ts)| |
| +-------------------+----------+-------------+ |
| |2015-04-08 13:08:15| string| Apr| |
| |2024-10-31 10:09:16| string| Oct| |
| +-------------------+----------+-------------+ |
| |
| Example 3: Extract the month name from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.monthname('dt')).show() |
| +----------+----------+-------------+ |
| | dt|typeof(dt)|monthname(dt)| |
| +----------+----------+-------------+ |
| |2015-04-08| date| Apr| |
| |2024-10-31| date| Oct| |
| +----------+----------+-------------+ |
| |
| Example 4: Extract the month name from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.monthname('ts')).show() |
| +-------------------+----------+-------------+ |
| | ts|typeof(ts)|monthname(ts)| |
| +-------------------+----------+-------------+ |
| |2015-04-08 13:08:15| timestamp| Apr| |
| |2024-10-31 10:09:16| timestamp| Oct| |
| +-------------------+----------+-------------+ |
| """ |
| return _invoke_function_over_columns("monthname", col) |
| |
| |
| @_try_remote_functions |
| def dayname(col: "ColumnOrName") -> Column: |
| """ |
Returns the three-letter abbreviated day name from the given date/timestamp.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target date/timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the three-letter abbreviation of day name for date/timestamp (Mon, Tue, Wed...) |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.monthname` |
| |
| Examples |
| -------- |
| Example 1: Extract the weekday name from a string column representing dates |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',), ('2024-10-31',)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.dayname('dt')).show() |
| +----------+----------+-----------+ |
| | dt|typeof(dt)|dayname(dt)| |
| +----------+----------+-----------+ |
| |2015-04-08| string| Wed| |
| |2024-10-31| string| Thu| |
| +----------+----------+-----------+ |
| |
Example 2: Extract the weekday name from a string column representing timestamps
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 13:08:15',), ('2024-10-31 10:09:16',)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.dayname('ts')).show() |
| +-------------------+----------+-----------+ |
| | ts|typeof(ts)|dayname(ts)| |
| +-------------------+----------+-----------+ |
| |2015-04-08 13:08:15| string| Wed| |
| |2024-10-31 10:09:16| string| Thu| |
| +-------------------+----------+-----------+ |
| |
| Example 3: Extract the weekday name from a date column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.date(2015, 4, 8),), |
| ... (datetime.date(2024, 10, 31),)], ['dt']) |
| >>> df.select("*", sf.typeof('dt'), sf.dayname('dt')).show() |
| +----------+----------+-----------+ |
| | dt|typeof(dt)|dayname(dt)| |
| +----------+----------+-----------+ |
| |2015-04-08| date| Wed| |
| |2024-10-31| date| Thu| |
| +----------+----------+-----------+ |
| |
| Example 4: Extract the weekday name from a timestamp column |
| |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (datetime.datetime(2015, 4, 8, 13, 8, 15),), |
| ... (datetime.datetime(2024, 10, 31, 10, 9, 16),)], ['ts']) |
| >>> df.select("*", sf.typeof('ts'), sf.dayname('ts')).show() |
| +-------------------+----------+-----------+ |
| | ts|typeof(ts)|dayname(ts)| |
| +-------------------+----------+-----------+ |
| |2015-04-08 13:08:15| timestamp| Wed| |
| |2024-10-31 10:09:16| timestamp| Thu| |
| +-------------------+----------+-----------+ |
| """ |
| return _invoke_function_over_columns("dayname", col) |
| |
| |
| @_try_remote_functions |
| def extract(field: Column, source: "ColumnOrName") -> Column: |
| """ |
| Extracts a part of the date/timestamp or interval source. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| field : :class:`~pyspark.sql.Column` |
| selects which part of the source should be extracted. |
| source : :class:`~pyspark.sql.Column` or column name |
| a date/timestamp or interval column from where `field` should be extracted. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a part of the date/timestamp or interval source. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.year` |
| :meth:`pyspark.sql.functions.quarter` |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.hour` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.second` |
| :meth:`pyspark.sql.functions.datepart` |
| :meth:`pyspark.sql.functions.date_part` |
| |
| Examples |
| -------- |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) |
| >>> df.select( |
| ... '*', |
| ... sf.extract(sf.lit('YEAR'), 'ts').alias('year'), |
| ... sf.extract(sf.lit('month'), 'ts').alias('month'), |
| ... sf.extract(sf.lit('WEEK'), 'ts').alias('week'), |
| ... sf.extract(sf.lit('D'), df.ts).alias('day'), |
| ... sf.extract(sf.lit('M'), df.ts).alias('minute'), |
| ... sf.extract(sf.lit('S'), df.ts).alias('second') |
| ... ).show() |
| +-------------------+----+-----+----+---+------+---------+ |
| | ts|year|month|week|day|minute| second| |
| +-------------------+----+-----+----+---+------+---------+ |
| |2015-04-08 13:08:15|2015| 4| 15| 8| 8|15.000000| |
| +-------------------+----+-----+----+---+------+---------+ |
| """ |
| return _invoke_function_over_columns("extract", field, source) |
| |
| |
| @_try_remote_functions |
| def date_part(field: Column, source: "ColumnOrName") -> Column: |
| """ |
| Extracts a part of the date/timestamp or interval source. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| field : :class:`~pyspark.sql.Column` |
| selects which part of the source should be extracted, and supported string values |
are the same as the fields of the equivalent function `extract`.
| source : :class:`~pyspark.sql.Column` or column name |
| a date/timestamp or interval column from where `field` should be extracted. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a part of the date/timestamp or interval source. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.year` |
| :meth:`pyspark.sql.functions.quarter` |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.hour` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.second` |
| :meth:`pyspark.sql.functions.datepart` |
| :meth:`pyspark.sql.functions.extract` |
| |
| Examples |
| -------- |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) |
| >>> df.select( |
| ... '*', |
| ... sf.date_part(sf.lit('YEAR'), 'ts').alias('year'), |
| ... sf.date_part(sf.lit('month'), 'ts').alias('month'), |
| ... sf.date_part(sf.lit('WEEK'), 'ts').alias('week'), |
| ... sf.date_part(sf.lit('D'), df.ts).alias('day'), |
| ... sf.date_part(sf.lit('M'), df.ts).alias('minute'), |
| ... sf.date_part(sf.lit('S'), df.ts).alias('second') |
| ... ).show() |
| +-------------------+----+-----+----+---+------+---------+ |
| | ts|year|month|week|day|minute| second| |
| +-------------------+----+-----+----+---+------+---------+ |
| |2015-04-08 13:08:15|2015| 4| 15| 8| 8|15.000000| |
| +-------------------+----+-----+----+---+------+---------+ |
| """ |
| return _invoke_function_over_columns("date_part", field, source) |
| |
| |
| @_try_remote_functions |
| def datepart(field: Column, source: "ColumnOrName") -> Column: |
| """ |
| Extracts a part of the date/timestamp or interval source. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| field : :class:`~pyspark.sql.Column` |
| selects which part of the source should be extracted, and supported string values |
are the same as the fields of the equivalent function `extract`.
| source : :class:`~pyspark.sql.Column` or column name |
| a date/timestamp or interval column from where `field` should be extracted. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a part of the date/timestamp or interval source. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.year` |
| :meth:`pyspark.sql.functions.quarter` |
| :meth:`pyspark.sql.functions.month` |
| :meth:`pyspark.sql.functions.day` |
| :meth:`pyspark.sql.functions.hour` |
| :meth:`pyspark.sql.functions.minute` |
| :meth:`pyspark.sql.functions.second` |
| :meth:`pyspark.sql.functions.date_part` |
| :meth:`pyspark.sql.functions.extract` |
| |
| Examples |
| -------- |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts']) |
| >>> df.select( |
| ... '*', |
| ... sf.datepart(sf.lit('YEAR'), 'ts').alias('year'), |
| ... sf.datepart(sf.lit('month'), 'ts').alias('month'), |
| ... sf.datepart(sf.lit('WEEK'), 'ts').alias('week'), |
| ... sf.datepart(sf.lit('D'), df.ts).alias('day'), |
| ... sf.datepart(sf.lit('M'), df.ts).alias('minute'), |
| ... sf.datepart(sf.lit('S'), df.ts).alias('second') |
| ... ).show() |
| +-------------------+----+-----+----+---+------+---------+ |
| | ts|year|month|week|day|minute| second| |
| +-------------------+----+-----+----+---+------+---------+ |
| |2015-04-08 13:08:15|2015| 4| 15| 8| 8|15.000000| |
| +-------------------+----+-----+----+---+------+---------+ |
| """ |
| return _invoke_function_over_columns("datepart", field, source) |
| |
| |
| @_try_remote_functions |
| def make_date(year: "ColumnOrName", month: "ColumnOrName", day: "ColumnOrName") -> Column: |
| """ |
| Returns a column with a date built from the year, month and day columns. |
| |
| .. versionadded:: 3.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
year : :class:`~pyspark.sql.Column` or column name
The year to build the date.
month : :class:`~pyspark.sql.Column` or column name
The month to build the date.
day : :class:`~pyspark.sql.Column` or column name
The day to build the date.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a date built from given parts. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_timestamp` |
| :meth:`pyspark.sql.functions.make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.make_timestamp_ntz` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(2020, 6, 26)], ['Y', 'M', 'D']) |
| >>> df.select('*', sf.make_date(df.Y, 'M', df.D)).show() |
| +----+---+---+------------------+ |
| | Y| M| D|make_date(Y, M, D)| |
| +----+---+---+------------------+ |
| |2020| 6| 26| 2020-06-26| |
| +----+---+---+------------------+ |
| """ |
| return _invoke_function_over_columns("make_date", year, month, day) |
| |
| |
| @_try_remote_functions |
| def date_add(start: "ColumnOrName", days: Union["ColumnOrName", int]) -> Column: |
| """ |
Returns the date that is `days` days after `start`. If `days` is a negative value,
that number of days is subtracted from `start`.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| start : :class:`~pyspark.sql.Column` or column name |
| date column to work on. |
| days : :class:`~pyspark.sql.Column` or column name or int |
| how many days after the given date to calculate. |
| Accepts negative value as well to calculate backwards in time. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a date after/before given number of days. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.dateadd` |
| :meth:`pyspark.sql.functions.date_sub` |
| :meth:`pyspark.sql.functions.datediff` |
| :meth:`pyspark.sql.functions.date_diff` |
| :meth:`pyspark.sql.functions.timestamp_add` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08', 2,)], 'struct<dt:string,a:int>') |
| >>> df.select('*', sf.date_add(df.dt, 1)).show() |
| +----------+---+---------------+ |
| | dt| a|date_add(dt, 1)| |
| +----------+---+---------------+ |
| |2015-04-08| 2| 2015-04-09| |
| +----------+---+---------------+ |
| |
| >>> df.select('*', sf.date_add('dt', 'a')).show() |
| +----------+---+---------------+ |
| | dt| a|date_add(dt, a)| |
| +----------+---+---------------+ |
| |2015-04-08| 2| 2015-04-10| |
| +----------+---+---------------+ |
| |
| >>> df.select('*', sf.date_add('dt', sf.lit(-1))).show() |
| +----------+---+----------------+ |
| | dt| a|date_add(dt, -1)| |
| +----------+---+----------------+ |
| |2015-04-08| 2| 2015-04-07| |
| +----------+---+----------------+ |
| """ |
| days = _enum_to_value(days) |
| days = lit(days) if isinstance(days, int) else days |
| return _invoke_function_over_columns("date_add", start, days) |
| |
| |
| @_try_remote_functions |
| def dateadd(start: "ColumnOrName", days: Union["ColumnOrName", int]) -> Column: |
| """ |
Returns the date that is `days` days after `start`. If `days` is a negative value,
that number of days is subtracted from `start`.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| start : :class:`~pyspark.sql.Column` or column name |
| date column to work on. |
| days : :class:`~pyspark.sql.Column` or column name or int |
| how many days after the given date to calculate. |
| Accepts negative value as well to calculate backwards in time. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a date after/before given number of days. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.date_add` |
| :meth:`pyspark.sql.functions.date_sub` |
| :meth:`pyspark.sql.functions.datediff` |
| :meth:`pyspark.sql.functions.date_diff` |
| :meth:`pyspark.sql.functions.timestamp_add` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08', 2,)], 'struct<dt:string,a:int>') |
| >>> df.select('*', sf.dateadd(df.dt, 1)).show() |
| +----------+---+---------------+ |
| | dt| a|date_add(dt, 1)| |
| +----------+---+---------------+ |
| |2015-04-08| 2| 2015-04-09| |
| +----------+---+---------------+ |
| |
| >>> df.select('*', sf.dateadd('dt', 'a')).show() |
| +----------+---+---------------+ |
| | dt| a|date_add(dt, a)| |
| +----------+---+---------------+ |
| |2015-04-08| 2| 2015-04-10| |
| +----------+---+---------------+ |
| |
| >>> df.select('*', sf.dateadd('dt', sf.lit(-1))).show() |
| +----------+---+----------------+ |
| | dt| a|date_add(dt, -1)| |
| +----------+---+----------------+ |
| |2015-04-08| 2| 2015-04-07| |
| +----------+---+----------------+ |
| """ |
| days = _enum_to_value(days) |
| days = lit(days) if isinstance(days, int) else days |
| return _invoke_function_over_columns("dateadd", start, days) |
| |
| |
| @_try_remote_functions |
| def date_sub(start: "ColumnOrName", days: Union["ColumnOrName", int]) -> Column: |
| """ |
Returns the date that is `days` days before `start`. If `days` is a negative value,
that number of days is added to `start`.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| start : :class:`~pyspark.sql.Column` or column name |
| date column to work on. |
| days : :class:`~pyspark.sql.Column` or column name or int |
| how many days before the given date to calculate. |
| Accepts negative value as well to calculate forward in time. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a date before/after given number of days. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.dateadd` |
| :meth:`pyspark.sql.functions.date_add` |
| :meth:`pyspark.sql.functions.datediff` |
| :meth:`pyspark.sql.functions.date_diff` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08', 2,)], 'struct<dt:string,a:int>') |
| >>> df.select('*', sf.date_sub(df.dt, 1)).show() |
| +----------+---+---------------+ |
| | dt| a|date_sub(dt, 1)| |
| +----------+---+---------------+ |
| |2015-04-08| 2| 2015-04-07| |
| +----------+---+---------------+ |
| |
| >>> df.select('*', sf.date_sub('dt', 'a')).show() |
| +----------+---+---------------+ |
| | dt| a|date_sub(dt, a)| |
| +----------+---+---------------+ |
| |2015-04-08| 2| 2015-04-06| |
| +----------+---+---------------+ |
| |
| >>> df.select('*', sf.date_sub('dt', sf.lit(-1))).show() |
| +----------+---+----------------+ |
| | dt| a|date_sub(dt, -1)| |
| +----------+---+----------------+ |
| |2015-04-08| 2| 2015-04-09| |
| +----------+---+----------------+ |
| """ |
| days = _enum_to_value(days) |
| days = lit(days) if isinstance(days, int) else days |
| return _invoke_function_over_columns("date_sub", start, days) |
| |
| |
| @_try_remote_functions |
| def datediff(end: "ColumnOrName", start: "ColumnOrName") -> Column: |
| """ |
| Returns the number of days from `start` to `end`. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
end : :class:`~pyspark.sql.Column` or column name
the end date column to work on.
start : :class:`~pyspark.sql.Column` or column name
the start date column to work on.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| difference in days between two dates. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.dateadd` |
| :meth:`pyspark.sql.functions.date_add` |
| :meth:`pyspark.sql.functions.date_sub` |
| :meth:`pyspark.sql.functions.date_diff` |
| :meth:`pyspark.sql.functions.timestamp_diff` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2']) |
| >>> df.select('*', sf.datediff('d1', 'd2')).show() |
| +----------+----------+----------------+ |
| | d1| d2|datediff(d1, d2)| |
| +----------+----------+----------------+ |
| |2015-04-08|2015-05-10| -32| |
| +----------+----------+----------------+ |
| |
| >>> df.select('*', sf.datediff(df.d2, df.d1)).show() |
| +----------+----------+----------------+ |
| | d1| d2|datediff(d2, d1)| |
| +----------+----------+----------------+ |
| |2015-04-08|2015-05-10| 32| |
| +----------+----------+----------------+ |
| """ |
| return _invoke_function_over_columns("datediff", end, start) |
| |
| |
| @_try_remote_functions |
| def date_diff(end: "ColumnOrName", start: "ColumnOrName") -> Column: |
| """ |
| Returns the number of days from `start` to `end`. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
end : :class:`~pyspark.sql.Column` or column name
the end date column to work on.
start : :class:`~pyspark.sql.Column` or column name
the start date column to work on.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| difference in days between two dates. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.dateadd` |
| :meth:`pyspark.sql.functions.date_add` |
| :meth:`pyspark.sql.functions.date_sub` |
| :meth:`pyspark.sql.functions.datediff` |
| :meth:`pyspark.sql.functions.timestamp_diff` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2']) |
| >>> df.select('*', sf.date_diff('d1', 'd2')).show() |
| +----------+----------+-----------------+ |
| | d1| d2|date_diff(d1, d2)| |
| +----------+----------+-----------------+ |
| |2015-04-08|2015-05-10| -32| |
| +----------+----------+-----------------+ |
| |
| >>> df.select('*', sf.date_diff(df.d2, df.d1)).show() |
| +----------+----------+-----------------+ |
| | d1| d2|date_diff(d2, d1)| |
| +----------+----------+-----------------+ |
| |2015-04-08|2015-05-10| 32| |
| +----------+----------+-----------------+ |
| """ |
| return _invoke_function_over_columns("date_diff", end, start) |
| |
| |
| @_try_remote_functions |
| def date_from_unix_date(days: "ColumnOrName") -> Column: |
| """ |
Creates a date from the number of `days` since 1970-01-01.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| days : :class:`~pyspark.sql.Column` or column name |
| the target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the date from the number of days since 1970-01-01. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.from_unixtime` |
| :meth:`pyspark.sql.functions.unix_date` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(4).select('*', sf.date_from_unix_date('id')).show() |
| +---+-----------------------+ |
| | id|date_from_unix_date(id)| |
| +---+-----------------------+ |
| | 0| 1970-01-01| |
| | 1| 1970-01-02| |
| | 2| 1970-01-03| |
| | 3| 1970-01-04| |
| +---+-----------------------+ |
| """ |
| return _invoke_function_over_columns("date_from_unix_date", days) |
| |
| |
| @_try_remote_functions |
| def add_months(start: "ColumnOrName", months: Union["ColumnOrName", int]) -> Column: |
| """ |
Returns the date that is `months` months after `start`. If `months` is a negative value,
that number of months is subtracted from `start`.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| start : :class:`~pyspark.sql.Column` or column name |
| date column to work on. |
| months : :class:`~pyspark.sql.Column` or column name or int |
| how many months after the given date to calculate. |
| Accepts negative value as well to calculate backwards. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a date after/before given number of months. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.dateadd` |
| :meth:`pyspark.sql.functions.date_add` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08', 2,)], 'struct<dt:string,a:int>') |
| >>> df.select('*', sf.add_months(df.dt, 1)).show() |
| +----------+---+-----------------+ |
| | dt| a|add_months(dt, 1)| |
| +----------+---+-----------------+ |
| |2015-04-08| 2| 2015-05-08| |
| +----------+---+-----------------+ |
| |
| >>> df.select('*', sf.add_months('dt', 'a')).show() |
| +----------+---+-----------------+ |
| | dt| a|add_months(dt, a)| |
| +----------+---+-----------------+ |
| |2015-04-08| 2| 2015-06-08| |
| +----------+---+-----------------+ |
| |
| >>> df.select('*', sf.add_months('dt', sf.lit(-1))).show() |
| +----------+---+------------------+ |
| | dt| a|add_months(dt, -1)| |
| +----------+---+------------------+ |
| |2015-04-08| 2| 2015-03-08| |
| +----------+---+------------------+ |
| """ |
| months = _enum_to_value(months) |
| months = lit(months) if isinstance(months, int) else months |
| return _invoke_function_over_columns("add_months", start, months) |
| |
| |
| @_try_remote_functions |
| def months_between(date1: "ColumnOrName", date2: "ColumnOrName", roundOff: bool = True) -> Column: |
| """ |
| Returns number of months between dates date1 and date2. |
| If date1 is later than date2, then the result is positive. |
| A whole number is returned if both inputs have the same day of month or both are the last day |
| of their respective months. Otherwise, the difference is calculated assuming 31 days per month. |
| The result is rounded off to 8 digits unless `roundOff` is set to `False`. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| date1 : :class:`~pyspark.sql.Column` or column name |
| first date column. |
| date2 : :class:`~pyspark.sql.Column` or column name |
| second date column. |
| roundOff : bool, optional |
| whether to round (to 8 digits) the final value or not (default: True). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| number of months between two dates. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['d1', 'd2']) |
| >>> df.select('*', sf.months_between(df.d1, df.d2)).show() |
| +-------------------+----------+----------------------------+ |
| | d1| d2|months_between(d1, d2, true)| |
| +-------------------+----------+----------------------------+ |
| |1997-02-28 10:30:00|1996-10-30| 3.94959677| |
| +-------------------+----------+----------------------------+ |
| |
| >>> df.select('*', sf.months_between('d2', 'd1')).show() |
| +-------------------+----------+----------------------------+ |
| | d1| d2|months_between(d2, d1, true)| |
| +-------------------+----------+----------------------------+ |
| |1997-02-28 10:30:00|1996-10-30| -3.94959677| |
| +-------------------+----------+----------------------------+ |
| |
| >>> df.select('*', sf.months_between('d1', df.d2, False)).show() |
| +-------------------+----------+-----------------------------+ |
| | d1| d2|months_between(d1, d2, false)| |
| +-------------------+----------+-----------------------------+ |
| |1997-02-28 10:30:00|1996-10-30| 3.9495967741935...| |
| +-------------------+----------+-----------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
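# `roundOff` is forwarded as a plain Python bool; the rounding to 8 digits
# itself happens on the JVM side.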
| return _invoke_function( |
| "months_between", _to_java_column(date1), _to_java_column(date2), _enum_to_value(roundOff) |
| ) |
| |
| |
| @_try_remote_functions |
| def to_date(col: "ColumnOrName", format: Optional[str] = None) -> Column: |
| """Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.DateType` |
| using the optionally specified format. Specify formats according to `datetime pattern`_. |
| By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format |
| is omitted. Equivalent to ``col.cast("date")``. |
| |
| .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html |
| |
| .. versionadded:: 2.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to convert. |
| format: literal string, optional |
| format to use to convert date values. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| date value as :class:`pyspark.sql.types.DateType` type. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp_ltz` |
| :meth:`pyspark.sql.functions.to_timestamp_ntz` |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| :meth:`pyspark.sql.functions.try_to_timestamp` |
| :meth:`pyspark.sql.functions.date_format` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['ts']) |
| >>> df.select('*', sf.to_date(df.ts)).show() |
| +-------------------+-----------+ |
| | ts|to_date(ts)| |
| +-------------------+-----------+ |
| |1997-02-28 10:30:00| 1997-02-28| |
| +-------------------+-----------+ |
| |
| >>> df.select('*', sf.to_date('ts', 'yyyy-MM-dd HH:mm:ss')).show() |
| +-------------------+--------------------------------+ |
| | ts|to_date(ts, yyyy-MM-dd HH:mm:ss)| |
| +-------------------+--------------------------------+ |
| |1997-02-28 10:30:00| 1997-02-28| |
| +-------------------+--------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
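# With no format, rely on the cast-style conversion; otherwise pass the
# pattern string to the JVM-side two-argument overload.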
| if format is None: |
| return _invoke_function_over_columns("to_date", col) |
| else: |
| return _invoke_function("to_date", _to_java_column(col), _enum_to_value(format)) |
| |
| |
| @_try_remote_functions |
def try_to_date(col: "ColumnOrName", format: Optional[str] = None) -> Column:
"""This is a special version of `to_date` that performs the same operation, but returns a
NULL value instead of raising an error if the date cannot be created.
| |
| .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to convert. |
| format: literal string, optional |
| format to use to convert date values. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| date value as :class:`pyspark.sql.types.DateType` type. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp_ltz` |
| :meth:`pyspark.sql.functions.to_timestamp_ntz` |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| :meth:`pyspark.sql.functions.try_to_timestamp` |
| :meth:`pyspark.sql.functions.date_format` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28',)], ['ts']) |
| >>> df.select('*', sf.try_to_date(df.ts)).show() |
| +----------+---------------+ |
| | ts|try_to_date(ts)| |
| +----------+---------------+ |
| |1997-02-28| 1997-02-28| |
| +----------+---------------+ |
| |
| >>> df.select('*', sf.try_to_date('ts', 'yyyy-MM-dd')).show() |
| +----------+---------------------------+ |
| | ts|try_to_date(ts, yyyy-MM-dd)| |
| +----------+---------------------------+ |
| |1997-02-28| 1997-02-28| |
| +----------+---------------------------+ |
| |
| >>> df = spark.createDataFrame([('foo',)], ['ts']) |
| >>> df.select(sf.try_to_date(df.ts)).show() |
| +---------------+ |
| |try_to_date(ts)| |
| +---------------+ |
| | NULL| |
| +---------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if format is None: |
| return _invoke_function_over_columns("try_to_date", col) |
| else: |
| return _invoke_function("try_to_date", _to_java_column(col), _enum_to_value(format)) |
| |
| |
| @_try_remote_functions |
| def unix_date(col: "ColumnOrName") -> Column: |
| """Returns the number of days since 1970-01-01. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to convert. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the number of days since 1970-01-01. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.date_from_unix_date` |
| :meth:`pyspark.sql.functions.unix_seconds` |
| :meth:`pyspark.sql.functions.unix_millis` |
| :meth:`pyspark.sql.functions.unix_micros` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1970-01-02',), ('2022-01-02',)], ['dt']) |
| >>> df.select('*', sf.unix_date(sf.to_date('dt'))).show() |
| +----------+----------------------+ |
| | dt|unix_date(to_date(dt))| |
| +----------+----------------------+ |
| |1970-01-02| 1| |
| |2022-01-02| 18994| |
| +----------+----------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| return _invoke_function_over_columns("unix_date", col) |
| |
| |
| @_try_remote_functions |
| def unix_micros(col: "ColumnOrName") -> Column: |
| """Returns the number of microseconds since 1970-01-01 00:00:00 UTC. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to convert. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the number of microseconds since 1970-01-01 00:00:00 UTC. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.unix_date` |
| :meth:`pyspark.sql.functions.unix_seconds` |
| :meth:`pyspark.sql.functions.unix_millis` |
| :meth:`pyspark.sql.functions.timestamp_micros` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-07-22 10:00:00',), ('2022-10-09 11:12:13',)], ['ts']) |
| >>> df.select('*', sf.unix_micros(sf.to_timestamp('ts'))).show() |
| +-------------------+-----------------------------+ |
| | ts|unix_micros(to_timestamp(ts))| |
| +-------------------+-----------------------------+ |
| |2015-07-22 10:00:00| 1437584400000000| |
| |2022-10-09 11:12:13| 1665339133000000| |
| +-------------------+-----------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| return _invoke_function_over_columns("unix_micros", col) |
| |
| |
| @_try_remote_functions |
| def unix_millis(col: "ColumnOrName") -> Column: |
| """Returns the number of milliseconds since 1970-01-01 00:00:00 UTC. |
| Truncates higher levels of precision. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to convert. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the number of milliseconds since 1970-01-01 00:00:00 UTC. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.unix_date` |
| :meth:`pyspark.sql.functions.unix_seconds` |
| :meth:`pyspark.sql.functions.unix_micros` |
| :meth:`pyspark.sql.functions.timestamp_millis` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-07-22 10:00:00',), ('2022-10-09 11:12:13',)], ['ts']) |
| >>> df.select('*', sf.unix_millis(sf.to_timestamp('ts'))).show() |
| +-------------------+-----------------------------+ |
| | ts|unix_millis(to_timestamp(ts))| |
| +-------------------+-----------------------------+ |
| |2015-07-22 10:00:00| 1437584400000| |
| |2022-10-09 11:12:13| 1665339133000| |
| +-------------------+-----------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| return _invoke_function_over_columns("unix_millis", col) |
| |
| |
| @_try_remote_functions |
| def unix_seconds(col: "ColumnOrName") -> Column: |
| """Returns the number of seconds since 1970-01-01 00:00:00 UTC. |
| Truncates higher levels of precision. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to convert. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the number of seconds since 1970-01-01 00:00:00 UTC. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.unix_date` |
| :meth:`pyspark.sql.functions.unix_millis` |
| :meth:`pyspark.sql.functions.unix_micros` |
| :meth:`pyspark.sql.functions.from_unixtime` |
| :meth:`pyspark.sql.functions.timestamp_seconds` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-07-22 10:00:00',), ('2022-10-09 11:12:13',)], ['ts']) |
| >>> df.select('*', sf.unix_seconds(sf.to_timestamp('ts'))).show() |
| +-------------------+------------------------------+ |
| | ts|unix_seconds(to_timestamp(ts))| |
| +-------------------+------------------------------+ |
| |2015-07-22 10:00:00| 1437584400| |
| |2022-10-09 11:12:13| 1665339133| |
| +-------------------+------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| return _invoke_function_over_columns("unix_seconds", col) |
| |
| |
| @overload |
| def to_time(str: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @overload |
| def to_time(str: "ColumnOrName", format: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def to_time(str: "ColumnOrName", format: Optional["ColumnOrName"] = None) -> Column: |
| """Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimeType` using the |
| optionally specified format. Specify formats according to `datetime pattern`_. By default, it |
| follows casting rules to :class:`pyspark.sql.types.TimeType` if the format is omitted. |
| Equivalent to ``col.cast("time")``. |
| |
| .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| string to be parsed to time. |
| format: :class:`~pyspark.sql.Column` or column name, optional |
| time format pattern to follow. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| time value as :class:`pyspark.sql.types.TimeType` type. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.try_to_time` |
| |
| Examples |
| -------- |
| Example 1: Convert string to a time |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("10:30:00",)], ["str"]) |
| >>> df.select(sf.to_time(df.str)).show() |
| +------------+ |
| |to_time(str)| |
| +------------+ |
| | 10:30:00| |
| +------------+ |
| |
| Example 2: Convert string to a time with a format |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("10:30:00", "HH:mm:ss")], ["str", "format"]) |
| >>> df.select(sf.to_time(df.str, df.format)).show() |
| +--------------------+ |
| |to_time(str, format)| |
| +--------------------+ |
| | 10:30:00| |
| +--------------------+ |
| """ |
| if format is None: |
| return _invoke_function_over_columns("to_time", str) |
| else: |
| return _invoke_function_over_columns("to_time", str, format) |
| |
| |
| @overload |
| def to_timestamp(col: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @overload |
| def to_timestamp(col: "ColumnOrName", format: str) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def to_timestamp(col: "ColumnOrName", format: Optional[str] = None) -> Column: |
| """Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimestampType` |
| using the optionally specified format. Specify formats according to `datetime pattern`_. |
| By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format |
| is omitted. Equivalent to ``col.cast("timestamp")``. |
| |
| .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html |
| |
| .. versionadded:: 2.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column values to convert. |
| format: literal string, optional |
| format to use to convert timestamp values. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| timestamp value as :class:`pyspark.sql.types.TimestampType` type. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_date` |
| :meth:`pyspark.sql.functions.to_timestamp_ltz` |
| :meth:`pyspark.sql.functions.to_timestamp_ntz` |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| :meth:`pyspark.sql.functions.to_unix_timestamp` |
| :meth:`pyspark.sql.functions.try_to_timestamp` |
| :meth:`pyspark.sql.functions.date_format` |
| |
| Examples |
| -------- |
| Example 1: Convert string to a timestamp |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) |
| >>> df.select(sf.to_timestamp(df.t)).show() |
| +-------------------+ |
| | to_timestamp(t)| |
| +-------------------+ |
| |1997-02-28 10:30:00| |
| +-------------------+ |
| |
| Example 2: Convert string to a timestamp with a format |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) |
| >>> df.select(sf.to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss')).show() |
| +------------------------------------+ |
| |to_timestamp(t, yyyy-MM-dd HH:mm:ss)| |
| +------------------------------------+ |
| | 1997-02-28 10:30:00| |
| +------------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if format is None: |
| return _invoke_function_over_columns("to_timestamp", col) |
| else: |
| return _invoke_function("to_timestamp", _to_java_column(col), _enum_to_value(format)) |
| |
| |
| @overload |
| def try_to_time(str: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @overload |
| def try_to_time(str: "ColumnOrName", format: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def try_to_time(str: "ColumnOrName", format: Optional["ColumnOrName"] = None) -> Column: |
| """Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimeType` using the |
| optionally specified format. Specify formats according to `datetime pattern`_. By default, it |
| follows casting rules to :class:`pyspark.sql.types.TimeType` if the format is omitted. |
| Equivalent to ``col.cast("time")``. The function always returns null on an invalid input. |
| |
| .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| string to be parsed to time. |
| format: :class:`~pyspark.sql.Column` or column name, optional |
| time format pattern to follow. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| time value as :class:`pyspark.sql.types.TimeType` type. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_time` |
| :meth:`pyspark.sql.functions.try_to_timestamp` |
| |
| Examples |
| -------- |
| Example 1: Convert string to a time |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("10:30:00",)], ["str"]) |
| >>> df.select(sf.try_to_time(df.str).alias("time")).show() |
| +--------+ |
| | time| |
| +--------+ |
| |10:30:00| |
| +--------+ |
| |
| Example 2: Convert string to a time with a format |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("10:30:00", "HH:mm:ss")], ["str", "format"]) |
| >>> df.select(sf.try_to_time(df.str, df.format).alias("time")).show() |
| +--------+ |
| | time| |
| +--------+ |
| |10:30:00| |
| +--------+ |
| |
Example 3: Conversion failure results in NULL
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("malformed",)], ["str"]) |
| >>> df.select(sf.try_to_time(df.str).alias("time")).show() |
| +----+ |
| |time| |
| +----+ |
| |NULL| |
| +----+ |
| """ |
| if format is None: |
| return _invoke_function_over_columns("try_to_time", str) |
| else: |
| return _invoke_function_over_columns("try_to_time", str, format) |
| |
| |
| @_try_remote_functions |
| def try_to_timestamp(col: "ColumnOrName", format: Optional["ColumnOrName"] = None) -> Column: |
| """ |
| Parses the `col` with the `format` to a timestamp. The function always |
| returns null on an invalid input with/without ANSI SQL mode enabled. The result data type is |
| consistent with the value of configuration `spark.sql.timestampType`. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column values to convert. |
| format: literal string, optional |
| format to use to convert timestamp values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_date` |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| :meth:`pyspark.sql.functions.date_format` |
| |
| Examples |
| -------- |
| Example 1: Convert string to a timestamp |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) |
| >>> df.select(sf.try_to_timestamp(df.t)).show() |
| +-------------------+ |
| |try_to_timestamp(t)| |
| +-------------------+ |
| |1997-02-28 10:30:00| |
| +-------------------+ |
| |
| Example 2: Convert string to a timestamp with a format |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) |
| >>> df.select(sf.try_to_timestamp(df.t, sf.lit('yyyy-MM-dd HH:mm:ss'))).show() |
| +----------------------------------------+ |
| |try_to_timestamp(t, yyyy-MM-dd HH:mm:ss)| |
| +----------------------------------------+ |
| | 1997-02-28 10:30:00| |
| +----------------------------------------+ |
| |
Example 3: Conversion failure results in NULL even when ANSI mode is on
| |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... df = spark.createDataFrame([('malformed',)], ['t']) |
| ... df.select(sf.try_to_timestamp(df.t)).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +-------------------+ |
| |try_to_timestamp(t)| |
| +-------------------+ |
| | NULL| |
| +-------------------+ |
| """ |
| if format is not None: |
| return _invoke_function_over_columns("try_to_timestamp", col, format) |
| else: |
| return _invoke_function_over_columns("try_to_timestamp", col) |
| |
| |
| @_try_remote_functions |
| def xpath(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
| Returns a string array of values within the nodes of xml that match the XPath expression. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>',)], ['x']) |
| >>> df.select(sf.xpath(df.x, sf.lit('a/b/text()'))).show() |
| +--------------------+ |
| |xpath(x, a/b/text())| |
| +--------------------+ |
| | [b1, b2, b3]| |
| +--------------------+ |
| """ |
| return _invoke_function_over_columns("xpath", xml, path) |
| |
| |
| @_try_remote_functions |
| def xpath_boolean(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
| Returns true if the XPath expression evaluates to true, or if a matching node is found. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('<a><b>1</b></a>',)], ['x']) |
| >>> df.select(sf.xpath_boolean(df.x, sf.lit('a/b'))).show() |
| +---------------------+ |
| |xpath_boolean(x, a/b)| |
| +---------------------+ |
| | true| |
| +---------------------+ |
| """ |
| return _invoke_function_over_columns("xpath_boolean", xml, path) |
| |
| |
| @_try_remote_functions |
| def xpath_double(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
| Returns a double value, the value zero if no match is found, |
| or NaN if a match is found but the value is non-numeric. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) |
| >>> df.select(sf.xpath_double(df.x, sf.lit('sum(a/b)'))).show() |
| +-------------------------+ |
| |xpath_double(x, sum(a/b))| |
| +-------------------------+ |
| | 3.0| |
| +-------------------------+ |
| """ |
| return _invoke_function_over_columns("xpath_double", xml, path) |
| |
| |
| @_try_remote_functions |
| def xpath_number(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
| Returns a double value, the value zero if no match is found, |
| or NaN if a match is found but the value is non-numeric. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [('<a><b>1</b><b>2</b></a>',)], ['x'] |
| ... ).select(sf.xpath_number('x', sf.lit('sum(a/b)'))).show() |
| +-------------------------+ |
| |xpath_number(x, sum(a/b))| |
| +-------------------------+ |
| | 3.0| |
| +-------------------------+ |
| """ |
| return _invoke_function_over_columns("xpath_number", xml, path) |
| |
| |
| @_try_remote_functions |
| def xpath_float(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
| Returns a float value, the value zero if no match is found, |
| or NaN if a match is found but the value is non-numeric. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) |
| >>> df.select(sf.xpath_float(df.x, sf.lit('sum(a/b)'))).show() |
| +------------------------+ |
| |xpath_float(x, sum(a/b))| |
| +------------------------+ |
| | 3.0| |
| +------------------------+ |
| """ |
| return _invoke_function_over_columns("xpath_float", xml, path) |
| |
| |
| @_try_remote_functions |
| def xpath_int(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
Returns an integer value, or the value zero if no match is found,
or if a match is found but the value is non-numeric.
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) |
| >>> df.select(sf.xpath_int(df.x, sf.lit('sum(a/b)'))).show() |
| +----------------------+ |
| |xpath_int(x, sum(a/b))| |
| +----------------------+ |
| | 3| |
| +----------------------+ |
| """ |
| return _invoke_function_over_columns("xpath_int", xml, path) |
| |
| |
| @_try_remote_functions |
| def xpath_long(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
Returns a long integer value, or the value zero if no match is found,
or if a match is found but the value is non-numeric.
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) |
| >>> df.select(sf.xpath_long(df.x, sf.lit('sum(a/b)'))).show() |
| +-----------------------+ |
| |xpath_long(x, sum(a/b))| |
| +-----------------------+ |
| | 3| |
| +-----------------------+ |
| """ |
| return _invoke_function_over_columns("xpath_long", xml, path) |
| |
| |
| @_try_remote_functions |
| def xpath_short(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
Returns a short integer value, or the value zero if no match is found,
or if a match is found but the value is non-numeric.
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) |
| >>> df.select(sf.xpath_short(df.x, sf.lit('sum(a/b)'))).show() |
| +------------------------+ |
| |xpath_short(x, sum(a/b))| |
| +------------------------+ |
| | 3| |
| +------------------------+ |
| """ |
| return _invoke_function_over_columns("xpath_short", xml, path) |
| |
| |
| @_try_remote_functions |
| def xpath_string(xml: "ColumnOrName", path: "ColumnOrName") -> Column: |
| """ |
| Returns the text contents of the first xml node that matches the XPath expression. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('<a><b>b</b><c>cc</c></a>',)], ['x']) |
| >>> df.select(sf.xpath_string(df.x, sf.lit('a/c'))).show() |
| +--------------------+ |
| |xpath_string(x, a/c)| |
| +--------------------+ |
| | cc| |
| +--------------------+ |
| """ |
| return _invoke_function_over_columns("xpath_string", xml, path) |
| |
| |
| @_try_remote_functions |
| def trunc(date: "ColumnOrName", format: str) -> Column: |
| """ |
| Returns date truncated to the unit specified by the format. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| date : :class:`~pyspark.sql.Column` or column name |
| input column of values to truncate. |
| format : literal string |
'year', 'yyyy', 'yy' to truncate by year,
or 'month', 'mon', 'mm' to truncate by month.
Other options are: 'week', 'quarter'.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| truncated date. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.date_trunc` |
| :meth:`pyspark.sql.functions.time_trunc` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28',)], ['dt']) |
| >>> df.select('*', sf.trunc(df.dt, 'year')).show() |
| +----------+---------------+ |
| | dt|trunc(dt, year)| |
| +----------+---------------+ |
| |1997-02-28| 1997-01-01| |
| +----------+---------------+ |
| |
| >>> df.select('*', sf.trunc('dt', 'mon')).show() |
| +----------+--------------+ |
| | dt|trunc(dt, mon)| |
| +----------+--------------+ |
| |1997-02-28| 1997-02-01| |
| +----------+--------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("trunc", _to_java_column(date), _enum_to_value(format)) |
| |
| |
| @_try_remote_functions |
| def date_trunc(format: str, timestamp: "ColumnOrName") -> Column: |
| """ |
| Returns timestamp truncated to the unit specified by the format. |
| |
| .. versionadded:: 2.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| format : literal string |
| 'year', 'yyyy', 'yy' to truncate by year, |
| 'month', 'mon', 'mm' to truncate by month, |
'day', 'dd' to truncate by day.
Other options are:
'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter'.
| timestamp : :class:`~pyspark.sql.Column` or column name |
| input column of values to truncate. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| truncated timestamp. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.trunc` |
| :meth:`pyspark.sql.functions.time_trunc` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 05:02:11',)], ['ts']) |
| >>> df.select('*', sf.date_trunc('year', df.ts)).show() |
| +-------------------+--------------------+ |
| | ts|date_trunc(year, ts)| |
| +-------------------+--------------------+ |
| |1997-02-28 05:02:11| 1997-01-01 00:00:00| |
| +-------------------+--------------------+ |
| |
| >>> df.select('*', sf.date_trunc('mon', 'ts')).show() |
| +-------------------+-------------------+ |
| | ts|date_trunc(mon, ts)| |
| +-------------------+-------------------+ |
| |1997-02-28 05:02:11|1997-02-01 00:00:00| |
| +-------------------+-------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("date_trunc", _enum_to_value(format), _to_java_column(timestamp)) |
| |
| |
| @_try_remote_functions |
| def next_day(date: "ColumnOrName", dayOfWeek: str) -> Column: |
| """ |
Returns the first date which is later than the value of the date column,
based on the second `dayOfWeek` argument.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| date : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| dayOfWeek : literal string |
| day of the week, case-insensitive, accepts: |
| "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column of computed results. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2015-07-27',)], ['dt']) |
| >>> df.select('*', sf.next_day(df.dt, 'Sun')).show() |
| +----------+-----------------+ |
| | dt|next_day(dt, Sun)| |
| +----------+-----------------+ |
| |2015-07-27| 2015-08-02| |
| +----------+-----------------+ |
| |
| >>> df.select('*', sf.next_day('dt', 'Sat')).show() |
| +----------+-----------------+ |
| | dt|next_day(dt, Sat)| |
| +----------+-----------------+ |
| |2015-07-27| 2015-08-01| |
| +----------+-----------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("next_day", _to_java_column(date), _enum_to_value(dayOfWeek)) |
| |
| |
| @_try_remote_functions |
| def last_day(date: "ColumnOrName") -> Column: |
| """ |
| Returns the last day of the month which the given date belongs to. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| date : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| last day of the month. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('1997-02-10',)], ['dt']) |
| >>> df.select('*', sf.last_day(df.dt)).show() |
| +----------+------------+ |
| | dt|last_day(dt)| |
| +----------+------------+ |
| |1997-02-10| 1997-02-28| |
| +----------+------------+ |
| |
| >>> df.select('*', sf.last_day('dt')).show() |
| +----------+------------+ |
| | dt|last_day(dt)| |
| +----------+------------+ |
| |1997-02-10| 1997-02-28| |
| +----------+------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("last_day", _to_java_column(date)) |
| |
| |
| @_try_remote_functions |
| def from_unixtime(timestamp: "ColumnOrName", format: str = "yyyy-MM-dd HH:mm:ss") -> Column: |
| """ |
| Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string |
| representing the timestamp of that moment in the current system time zone in the given |
| format. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| timestamp : :class:`~pyspark.sql.Column` or column name |
| column of unix time values. |
| format : literal string, optional |
| format to use to convert to (default: yyyy-MM-dd HH:mm:ss) |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| formatted timestamp as string. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.date_from_unix_date` |
| :meth:`pyspark.sql.functions.unix_seconds` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1428476400,)], ['unix_time']) |
| >>> df.select('*', sf.from_unixtime('unix_time')).show() |
| +----------+---------------------------------------------+ |
| | unix_time|from_unixtime(unix_time, yyyy-MM-dd HH:mm:ss)| |
| +----------+---------------------------------------------+ |
| |1428476400| 2015-04-08 00:00:00| |
| +----------+---------------------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("from_unixtime", _to_java_column(timestamp), _enum_to_value(format)) |
| |
| |
| @overload |
| def unix_timestamp(timestamp: "ColumnOrName", format: str = ...) -> Column: |
| ... |
| |
| |
| @overload |
| def unix_timestamp() -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def unix_timestamp( |
| timestamp: Optional["ColumnOrName"] = None, format: str = "yyyy-MM-dd HH:mm:ss" |
| ) -> Column: |
| """ |
Convert time string with given pattern ('yyyy-MM-dd HH:mm:ss', by default)
to Unix time stamp (in seconds), using the default timezone and the default
locale. Returns null if the conversion fails.

If `timestamp` is None, then it returns the current timestamp.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| timestamp : :class:`~pyspark.sql.Column` or column name, optional |
| timestamps of string values. |
| format : literal string, optional |
| alternative format to use for converting (default: yyyy-MM-dd HH:mm:ss). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| unix time as long integer. |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| Example 1: Returns the current timestamp in UNIX. |
| |
| >>> import pyspark.sql.functions as sf |
>>> spark.range(1).select(sf.unix_timestamp().alias('unix_time')).show() # doctest: +SKIP
| +----------+ |
| | unix_time| |
| +----------+ |
| |1702018137| |
| +----------+ |
| |
| Example 2: Using default format 'yyyy-MM-dd HH:mm:ss' parses the timestamp string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['ts']) |
| >>> df.select('*', sf.unix_timestamp('ts')).show() |
| +-------------------+---------------------------------------+ |
| | ts|unix_timestamp(ts, yyyy-MM-dd HH:mm:ss)| |
| +-------------------+---------------------------------------+ |
| |2015-04-08 12:12:12| 1428520332| |
| +-------------------+---------------------------------------+ |
| |
| Example 3: Using user-specified format 'yyyy-MM-dd' parses the timestamp string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) |
| >>> df.select('*', sf.unix_timestamp('dt', 'yyyy-MM-dd')).show() |
| +----------+------------------------------+ |
| | dt|unix_timestamp(dt, yyyy-MM-dd)| |
| +----------+------------------------------+ |
| |2015-04-08| 1428476400| |
| +----------+------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
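# Without a column argument, the zero-argument overload is invoked, which
# returns the current Unix timestamp at evaluation time.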
| if timestamp is None: |
| return _invoke_function("unix_timestamp") |
| return _invoke_function("unix_timestamp", _to_java_column(timestamp), _enum_to_value(format)) |
| |
| |
| @_try_remote_functions |
| def from_utc_timestamp(timestamp: "ColumnOrName", tz: Union[Column, str]) -> Column: |
| """ |
This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function
takes a timestamp which is timezone-agnostic, interprets it as a timestamp in UTC, and
renders that timestamp as a timestamp in the given time zone.

However, a timestamp in Spark represents a number of microseconds from the Unix epoch, which
is not timezone-agnostic. So in Spark this function just shifts the timestamp value from the
UTC timezone to the given timezone.

This function may return a confusing result if the input is a string with a timezone, e.g.
'2018-03-13T06:18:23+00:00'. The reason is that Spark first casts the string to a timestamp
according to the timezone in the string, and finally displays the result by converting the
timestamp to a string according to the session local timezone.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| timestamp : :class:`~pyspark.sql.Column` or column name |
| the column that contains timestamps |
| tz : :class:`~pyspark.sql.Column` or literal string |
| A string detailing the time zone ID that the input should be adjusted to. It should |
| be in the format of either region-based zone IDs or zone offsets. Region IDs must |
| have the form 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in |
| the format '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are |
| supported as aliases of '+00:00'. Other short names are not recommended to use |
| because they can be ambiguous. |
| |
| .. versionchanged:: 2.4 |
| `tz` can take a :class:`~pyspark.sql.Column` containing timezone ID strings. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| timestamp value represented in given timezone. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp_ltz` |
| :meth:`pyspark.sql.functions.to_timestamp_ntz` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz']) |
| >>> df.select('*', sf.from_utc_timestamp('ts', 'PST')).show() |
| +-------------------+---+---------------------------+ |
| | ts| tz|from_utc_timestamp(ts, PST)| |
| +-------------------+---+---------------------------+ |
| |1997-02-28 10:30:00|JST| 1997-02-28 02:30:00| |
| +-------------------+---+---------------------------+ |
| |
| >>> df.select('*', sf.from_utc_timestamp(df.ts, df.tz)).show() |
| +-------------------+---+--------------------------+ |
| | ts| tz|from_utc_timestamp(ts, tz)| |
| +-------------------+---+--------------------------+ |
| |1997-02-28 10:30:00|JST| 1997-02-28 19:30:00| |
| +-------------------+---+--------------------------+ |
| """ |
| return _invoke_function_over_columns("from_utc_timestamp", timestamp, lit(tz)) |
| |
| |
| @_try_remote_functions |
| def to_utc_timestamp(timestamp: "ColumnOrName", tz: Union[Column, str]) -> Column: |
| """ |
This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function
takes a timestamp which is timezone-agnostic, interprets it as a timestamp in the given
timezone, and renders that timestamp as a timestamp in UTC.

However, a timestamp in Spark represents a number of microseconds from the Unix epoch, which
is not timezone-agnostic. So in Spark this function just shifts the timestamp value from the
given timezone to the UTC timezone.

This function may return a confusing result if the input is a string with a timezone, e.g.
'2018-03-13T06:18:23+00:00'. The reason is that Spark first casts the string to a timestamp
according to the timezone in the string, and finally displays the result by converting the
timestamp to a string according to the session local timezone.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| timestamp : :class:`~pyspark.sql.Column` or column name |
| the column that contains timestamps |
| tz : :class:`~pyspark.sql.Column` or literal string |
| A string detailing the time zone ID that the input should be adjusted to. It should |
| be in the format of either region-based zone IDs or zone offsets. Region IDs must |
| have the form 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in |
| the format '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are |
| supported as aliases of '+00:00'. Other short names are not recommended to use |
| because they can be ambiguous. |
| |
| .. versionchanged:: 2.4.0 |
| `tz` can take a :class:`~pyspark.sql.Column` containing timezone ID strings. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| timestamp value represented in UTC timezone. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.from_utc_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp_ltz` |
| :meth:`pyspark.sql.functions.to_timestamp_ntz` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz']) |
| >>> df.select('*', sf.to_utc_timestamp('ts', "PST")).show() |
| +-------------------+---+-------------------------+ |
| | ts| tz|to_utc_timestamp(ts, PST)| |
| +-------------------+---+-------------------------+ |
| |1997-02-28 10:30:00|JST| 1997-02-28 18:30:00| |
| +-------------------+---+-------------------------+ |
| |
| >>> df.select('*', sf.to_utc_timestamp(df.ts, df.tz)).show() |
| +-------------------+---+------------------------+ |
| | ts| tz|to_utc_timestamp(ts, tz)| |
| +-------------------+---+------------------------+ |
| |1997-02-28 10:30:00|JST| 1997-02-28 01:30:00| |
| +-------------------+---+------------------------+ |
| """ |
| return _invoke_function_over_columns("to_utc_timestamp", timestamp, lit(tz)) |
| |
| |
| @_try_remote_functions |
| def timestamp_seconds(col: "ColumnOrName") -> Column: |
| """ |
| Converts the number of seconds from the Unix epoch (1970-01-01T00:00:00Z) |
| to a timestamp. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| unix time values. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| converted timestamp value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.timestamp_millis` |
| :meth:`pyspark.sql.functions.timestamp_micros` |
| :meth:`pyspark.sql.functions.unix_seconds` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "UTC") |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1230219000,), (1280219000,)], ['seconds']) |
| >>> df.select('*', sf.timestamp_seconds('seconds')).show() |
| +----------+--------------------------+ |
| | seconds|timestamp_seconds(seconds)| |
| +----------+--------------------------+ |
| |1230219000| 2008-12-25 15:30:00| |
| |1280219000| 2010-07-27 08:23:20| |
| +----------+--------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| |
| return _invoke_function_over_columns("timestamp_seconds", col) |
| |
| |
| @_try_remote_functions |
| def time_trunc(unit: "ColumnOrName", time: "ColumnOrName") -> Column: |
| """ |
| Returns `time` truncated to the `unit`. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| unit : :class:`~pyspark.sql.Column` or column name |
| The unit to truncate the time to. Supported units are: "HOUR", "MINUTE", "SECOND", |
| "MILLISECOND", and "MICROSECOND". The unit is case-insensitive. |
| time : :class:`~pyspark.sql.Column` or column name |
| A time to truncate. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A time truncated to the specified unit. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.trunc` |
| :meth:`pyspark.sql.functions.date_trunc` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("HOUR", "13:08:15")], |
| ... ['unit', 'time']).withColumn("time", sf.col("time").cast("time")) |
| >>> df.select('*', sf.time_trunc('unit', 'time')).show() |
| +----+--------+----------------------+ |
| |unit| time|time_trunc(unit, time)| |
| +----+--------+----------------------+ |
| |HOUR|13:08:15| 13:00:00| |
| +----+--------+----------------------+ |
| """ |
| return _invoke_function_over_columns("time_trunc", unit, time) |
| |
| |
| @_try_remote_functions |
| def timestamp_millis(col: "ColumnOrName") -> Column: |
| """ |
Creates a timestamp from the number of milliseconds since the UTC epoch.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| unix time values. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| converted timestamp value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.timestamp_seconds` |
| :meth:`pyspark.sql.functions.timestamp_micros` |
| :meth:`pyspark.sql.functions.unix_millis` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "UTC") |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1230219000,), (1280219000,)], ['millis']) |
| >>> df.select('*', sf.timestamp_millis('millis')).show() |
| +----------+------------------------+ |
| | millis|timestamp_millis(millis)| |
| +----------+------------------------+ |
| |1230219000| 1970-01-15 05:43:39| |
| |1280219000| 1970-01-15 19:36:59| |
| +----------+------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| return _invoke_function_over_columns("timestamp_millis", col) |
| |
| |
| @_try_remote_functions |
| def timestamp_micros(col: "ColumnOrName") -> Column: |
| """ |
Creates a timestamp from the number of microseconds since the UTC epoch.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| unix time values. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| converted timestamp value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.timestamp_seconds` |
| :meth:`pyspark.sql.functions.timestamp_millis` |
| :meth:`pyspark.sql.functions.unix_micros` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "UTC") |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1230219000,), (1280219000,)], ['micros']) |
| >>> df.select('*', sf.timestamp_micros('micros')).show(truncate=False) |
| +----------+------------------------+ |
| |micros |timestamp_micros(micros)| |
| +----------+------------------------+ |
| |1230219000|1970-01-01 00:20:30.219 | |
| |1280219000|1970-01-01 00:21:20.219 | |
| +----------+------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| return _invoke_function_over_columns("timestamp_micros", col) |
| |
| |
| @_try_remote_functions |
| def timestamp_diff(unit: str, start: "ColumnOrName", end: "ColumnOrName") -> Column: |
| """ |
| Gets the difference between the timestamps in the specified units by truncating |
| the fraction part. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| unit : literal string |
| This indicates the units of the difference between the given timestamps. |
| Supported options are (case insensitive): "YEAR", "QUARTER", "MONTH", "WEEK", |
| "DAY", "HOUR", "MINUTE", "SECOND", "MILLISECOND" and "MICROSECOND". |
start : :class:`~pyspark.sql.Column` or column name
A timestamp which the expression subtracts from `end`.
end : :class:`~pyspark.sql.Column` or column name
A timestamp from which the expression subtracts `start`.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the difference between the timestamps. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.datediff` |
| :meth:`pyspark.sql.functions.date_diff` |
| |
| Examples |
| -------- |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), datetime.datetime(2024, 4, 2, 9, 0, 7))], |
| ... ['ts1', 'ts2']) |
| >>> df.select('*', sf.timestamp_diff('year', 'ts1', 'ts2')).show() |
| +-------------------+-------------------+-----------------------------+ |
| | ts1| ts2|timestampdiff(year, ts1, ts2)| |
| +-------------------+-------------------+-----------------------------+ |
| |2016-03-11 09:00:07|2024-04-02 09:00:07| 8| |
| +-------------------+-------------------+-----------------------------+ |
| |
| >>> df.select('*', sf.timestamp_diff('WEEK', 'ts1', 'ts2')).show() |
| +-------------------+-------------------+-----------------------------+ |
| | ts1| ts2|timestampdiff(WEEK, ts1, ts2)| |
| +-------------------+-------------------+-----------------------------+ |
| |2016-03-11 09:00:07|2024-04-02 09:00:07| 420| |
| +-------------------+-------------------+-----------------------------+ |
| |
| >>> df.select('*', sf.timestamp_diff('day', df.ts2, df.ts1)).show() |
| +-------------------+-------------------+----------------------------+ |
| | ts1| ts2|timestampdiff(day, ts2, ts1)| |
| +-------------------+-------------------+----------------------------+ |
| |2016-03-11 09:00:07|2024-04-02 09:00:07| -2944| |
| +-------------------+-------------------+----------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "timestamp_diff", |
| _enum_to_value(unit), |
| _to_java_column(start), |
| _to_java_column(end), |
| ) |
| |
| |
| @_try_remote_functions |
| def timestamp_add(unit: str, quantity: "ColumnOrName", ts: "ColumnOrName") -> Column: |
| """ |
Adds the specified number of units to a timestamp, i.e. returns the timestamp `ts`
plus `quantity` units of the given `unit`.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| unit : literal string |
This indicates the unit of time to add to the given timestamp.
| Supported options are (case insensitive): "YEAR", "QUARTER", "MONTH", "WEEK", |
| "DAY", "HOUR", "MINUTE", "SECOND", "MILLISECOND" and "MICROSECOND". |
| quantity : :class:`~pyspark.sql.Column` or column name |
| The number of units of time that you want to add. |
| ts : :class:`~pyspark.sql.Column` or column name |
A timestamp to which the units are added.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
the resulting timestamp after adding the specified number of units.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.dateadd` |
| :meth:`pyspark.sql.functions.date_add` |
| |
| Examples |
| -------- |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 2), |
| ... (datetime.datetime(2024, 4, 2, 9, 0, 7), 3)], ['ts', 'quantity']) |
| >>> df.select('*', sf.timestamp_add('year', 'quantity', 'ts')).show() |
| +-------------------+--------+--------------------------------+ |
| | ts|quantity|timestampadd(year, quantity, ts)| |
| +-------------------+--------+--------------------------------+ |
| |2016-03-11 09:00:07| 2| 2018-03-11 09:00:07| |
| |2024-04-02 09:00:07| 3| 2027-04-02 09:00:07| |
| +-------------------+--------+--------------------------------+ |
| |
| >>> df.select('*', sf.timestamp_add('WEEK', sf.lit(5), df.ts)).show() |
| +-------------------+--------+-------------------------+ |
| | ts|quantity|timestampadd(WEEK, 5, ts)| |
| +-------------------+--------+-------------------------+ |
| |2016-03-11 09:00:07| 2| 2016-04-15 09:00:07| |
| |2024-04-02 09:00:07| 3| 2024-05-07 09:00:07| |
| +-------------------+--------+-------------------------+ |
| |
| >>> df.select('*', sf.timestamp_add('day', sf.lit(-5), 'ts')).show() |
| +-------------------+--------+-------------------------+ |
| | ts|quantity|timestampadd(day, -5, ts)| |
| +-------------------+--------+-------------------------+ |
| |2016-03-11 09:00:07| 2| 2016-03-06 09:00:07| |
| |2024-04-02 09:00:07| 3| 2024-03-28 09:00:07| |
| +-------------------+--------+-------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "timestamp_add", |
| _enum_to_value(unit), |
| _to_java_column(quantity), |
| _to_java_column(ts), |
| ) |
| |
| |
| @_try_remote_functions |
| def window( |
| timeColumn: "ColumnOrName", |
| windowDuration: str, |
| slideDuration: Optional[str] = None, |
| startTime: Optional[str] = None, |
| ) -> Column: |
| """Bucketize rows into one or more time windows given a timestamp specifying column. Window |
| starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window |
| [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in |
| the order of months are not supported. |
| |
The time column must be of :class:`pyspark.sql.types.TimestampType` or
:class:`pyspark.sql.types.TimestampNTZType`.
| |
| Durations are provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid |
| interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'. |
| If the ``slideDuration`` is not provided, the windows will be tumbling windows. |
| |
| The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start |
| window intervals. For example, in order to have hourly tumbling windows that start 15 minutes |
| past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`. |
| |
| The output column will be a struct called 'window' by default with the nested columns 'start' |
| and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`. |
| |
| .. versionadded:: 2.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| timeColumn : :class:`~pyspark.sql.Column` or column name |
| The column or the expression to use as the timestamp for windowing by time. |
| The time column must be of TimestampType or TimestampNTZType. |
| windowDuration : literal string |
| A string specifying the width of the window, e.g. `10 minutes`, |
| `1 second`. Check `org.apache.spark.unsafe.types.CalendarInterval` for |
| valid duration identifiers. Note that the duration is a fixed length of |
| time, and does not vary over time according to a calendar. For example, |
| `1 day` always means 86,400,000 milliseconds, not a calendar day. |
| slideDuration : literal string, optional |
| A new window will be generated every `slideDuration`. Must be less than |
| or equal to the `windowDuration`. Check |
| `org.apache.spark.unsafe.types.CalendarInterval` for valid duration |
| identifiers. This duration is likewise absolute, and does not vary |
| according to a calendar. |
| startTime : literal string, optional |
| The offset with respect to 1970-01-01 00:00:00 UTC with which to start |
| window intervals. For example, in order to have hourly tumbling windows that |
| start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide |
| `startTime` as `15 minutes`. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.window_time` |
| :meth:`pyspark.sql.functions.session_window` |
| |
| Examples |
| -------- |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ['dt', 'v']) |
| >>> df2 = df.groupBy(sf.window('dt', '5 seconds')).agg(sf.sum('v')) |
| >>> df2.show(truncate=False) |
| +------------------------------------------+------+ |
| |window |sum(v)| |
| +------------------------------------------+------+ |
| |{2016-03-11 09:00:05, 2016-03-11 09:00:10}|1 | |
| +------------------------------------------+------+ |
| |
| >>> df2.printSchema() |
| root |
| |-- window: struct (nullable = false) |
| | |-- start: timestamp (nullable = true) |
| | |-- end: timestamp (nullable = true) |
| |-- sum(v): long (nullable = true) |
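
When ``slideDuration`` is shorter than ``windowDuration``, each row falls into
multiple overlapping windows. A minimal sketch of such a sliding window; output omitted:

>>> df.groupBy(sf.window('dt', '10 seconds', '5 seconds')).agg(sf.sum('v')).show()  # doctest: +SKIP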
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| def check_string_field(field, fieldName): # type: ignore[no-untyped-def] |
| if not field or type(field) is not str: |
| raise PySparkTypeError( |
| errorClass="NOT_STR", |
| messageParameters={"arg_name": fieldName, "arg_type": type(field).__name__}, |
| ) |
| |
| windowDuration = _enum_to_value(windowDuration) |
| slideDuration = _enum_to_value(slideDuration) |
| startTime = _enum_to_value(startTime) |
| |
| time_col = _to_java_column(timeColumn) |
| check_string_field(windowDuration, "windowDuration") |
| if slideDuration and startTime: |
| check_string_field(slideDuration, "slideDuration") |
| check_string_field(startTime, "startTime") |
| return _invoke_function("window", time_col, windowDuration, slideDuration, startTime) |
| elif slideDuration: |
| check_string_field(slideDuration, "slideDuration") |
| return _invoke_function("window", time_col, windowDuration, slideDuration) |
| elif startTime: |
| check_string_field(startTime, "startTime") |
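# Tumbling windows with an offset: reuse windowDuration as the slide duration.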
| return _invoke_function("window", time_col, windowDuration, windowDuration, startTime) |
| else: |
| return _invoke_function("window", time_col, windowDuration) |
| |
| |
| @_try_remote_functions |
| def window_time( |
| windowColumn: "ColumnOrName", |
| ) -> Column: |
| """Computes the event time from a window column. The column window values are produced |
| by window aggregating operators and are of type `STRUCT<start: TIMESTAMP, end: TIMESTAMP>` |
| where start is inclusive and end is exclusive. The event time of records produced by window |
| aggregating operators can be computed as ``window_time(window)`` and are |
| ``window.end - lit(1).alias("microsecond")`` (as microsecond is the minimal supported event |
| time precision). The window column must be one produced by a window aggregating operator. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| windowColumn : :class:`~pyspark.sql.Column` or column name |
The window column produced by a window aggregating operator.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.window` |
| :meth:`pyspark.sql.functions.session_window` |
| |
| Examples |
| -------- |
| >>> import datetime |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)], ['dt', 'v']) |
| |
| Group the data into 5 second time windows and aggregate as sum. |
| |
| >>> df2 = df.groupBy(sf.window('dt', '5 seconds')).agg(sf.sum('v')) |
| |
| Extract the window event time using the window_time function. |
| |
| >>> df2.select('*', sf.window_time('window')).show(truncate=False) |
| +------------------------------------------+------+--------------------------+ |
| |window |sum(v)|window_time(window) | |
| +------------------------------------------+------+--------------------------+ |
| |{2016-03-11 09:00:05, 2016-03-11 09:00:10}|1 |2016-03-11 09:00:09.999999| |
| +------------------------------------------+------+--------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| window_col = _to_java_column(windowColumn) |
| return _invoke_function("window_time", window_col) |
| |
| |
| @_try_remote_functions |
| def session_window(timeColumn: "ColumnOrName", gapDuration: Union[Column, str]) -> Column: |
| """ |
Generates a session window given a timestamp specifying column.
A session window is a dynamic window, meaning its length varies according to the given
inputs. The length of a session window is defined as "the timestamp of the latest input
of the session + gap duration", so when new inputs are bound to the current session
window, the end time of the session window can be expanded according to the new inputs.
Windows can support microsecond precision. Windows in the order of months are not supported.
For a streaming query, you may use the function `current_timestamp` to generate windows on
processing time.
gapDuration is provided as a string, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid
interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'.
It could also be a Column which can be evaluated dynamically to a gap duration based on the
input row.
| The output column will be a struct called 'session_window' by default with the nested columns |
| 'start' and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`. |
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| timeColumn : :class:`~pyspark.sql.Column` or column name |
| The column name or column to use as the timestamp for windowing by time. |
| The time column must be of TimestampType or TimestampNTZType. |
| gapDuration : :class:`~pyspark.sql.Column` or literal string |
A Python string literal or column specifying the timeout of the session. It could be a
static value, e.g. `10 minutes`, `1 second`, or an expression/UDF that specifies gap
duration dynamically based on the input row.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.window` |
| :meth:`pyspark.sql.functions.window_time` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('2016-03-11 09:00:07', 1)], ['dt', 'v']) |
| >>> df2 = df.groupBy(sf.session_window('dt', '5 seconds')).agg(sf.sum('v')) |
| >>> df2.show(truncate=False) |
| +------------------------------------------+------+ |
| |session_window |sum(v)| |
| +------------------------------------------+------+ |
| |{2016-03-11 09:00:07, 2016-03-11 09:00:12}|1 | |
| +------------------------------------------+------+ |
| |
| >>> df2.printSchema() |
| root |
| |-- session_window: struct (nullable = false) |
| | |-- start: timestamp (nullable = true) |
| | |-- end: timestamp (nullable = true) |
| |-- sum(v): long (nullable = true) |
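
The gap duration can also vary per row. A minimal sketch, assuming a conditional gap
derived from the value column; output omitted:

>>> gap = sf.when(df.v > 1, sf.lit('10 seconds')).otherwise(sf.lit('5 seconds'))
>>> df.groupBy(sf.session_window('dt', gap)).agg(sf.sum('v')).show()  # doctest: +SKIP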
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| def check_field(field: Union[Column, str], fieldName: str) -> None: |
| if field is None or not isinstance(field, (str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_STR", |
| messageParameters={"arg_name": fieldName, "arg_type": type(field).__name__}, |
| ) |
| |
| time_col = _to_java_column(timeColumn) |
| gapDuration = _enum_to_value(gapDuration) |
| check_field(gapDuration, "gapDuration") |
| gap_duration = gapDuration if isinstance(gapDuration, str) else _to_java_column(gapDuration) |
| return _invoke_function("session_window", time_col, gap_duration) |
| |
| |
| @_try_remote_functions |
| def to_unix_timestamp( |
| timestamp: "ColumnOrName", |
| format: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Returns the UNIX timestamp of the given time. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
timestamp : :class:`~pyspark.sql.Column` or column name
Input column or column name.
format : :class:`~pyspark.sql.Column` or column name, optional
format to use to parse the timestamp values; defaults to 'yyyy-MM-dd HH:mm:ss'.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_date` |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp_ltz` |
| :meth:`pyspark.sql.functions.to_timestamp_ntz` |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| Example 1: Using default format to parse the timestamp string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['ts']) |
| >>> df.select('*', sf.to_unix_timestamp('ts')).show() |
| +-------------------+------------------------------------------+ |
| | ts|to_unix_timestamp(ts, yyyy-MM-dd HH:mm:ss)| |
| +-------------------+------------------------------------------+ |
| |2015-04-08 12:12:12| 1428520332| |
| +-------------------+------------------------------------------+ |
| |
| Example 2: Using user-specified format 'yyyy-MM-dd' to parse the date string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08',)], ['dt']) |
| >>> df.select('*', sf.to_unix_timestamp(df.dt, sf.lit('yyyy-MM-dd'))).show() |
| +----------+---------------------------------+ |
| | dt|to_unix_timestamp(dt, yyyy-MM-dd)| |
| +----------+---------------------------------+ |
| |2015-04-08| 1428476400| |
| +----------+---------------------------------+ |
| |
| Example 3: Using a format column to represent different formats. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame( |
| ... [('2015-04-08', 'yyyy-MM-dd'), ('2025+01+09', 'yyyy+MM+dd')], ['dt', 'fmt']) |
| >>> df.select('*', sf.to_unix_timestamp('dt', 'fmt')).show() |
| +----------+----------+--------------------------+ |
| | dt| fmt|to_unix_timestamp(dt, fmt)| |
| +----------+----------+--------------------------+ |
| |2015-04-08|yyyy-MM-dd| 1428476400| |
| |2025+01+09|yyyy+MM+dd| 1736409600| |
| +----------+----------+--------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| if format is not None: |
| return _invoke_function_over_columns("to_unix_timestamp", timestamp, format) |
| else: |
| return _invoke_function_over_columns("to_unix_timestamp", timestamp) |
| |
| |
| @_try_remote_functions |
| def to_timestamp_ltz( |
| timestamp: "ColumnOrName", |
| format: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Parses the `timestamp` with the `format` to a timestamp with time zone. |
Returns null for invalid input.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
timestamp : :class:`~pyspark.sql.Column` or column name
Input column or column name.
format : :class:`~pyspark.sql.Column` or column name, optional
format to use to parse the values into `TimestampType` timestamps.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_date` |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp_ntz` |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| :meth:`pyspark.sql.functions.to_unix_timestamp` |
| :meth:`pyspark.sql.functions.date_format` |
| |
| Examples |
| -------- |
| Example 1: Using default format to parse the timestamp string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['ts']) |
| >>> df.select('*', sf.to_timestamp_ltz('ts')).show() |
| +-------------------+--------------------+ |
| | ts|to_timestamp_ltz(ts)| |
| +-------------------+--------------------+ |
| |2015-04-08 12:12:12| 2015-04-08 12:12:12| |
| +-------------------+--------------------+ |
| |
| Example 2: Using user-specified format to parse the date string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2016-12-31',)], ['dt']) |
| >>> df.select('*', sf.to_timestamp_ltz(df.dt, sf.lit('yyyy-MM-dd'))).show() |
| +----------+--------------------------------+ |
| | dt|to_timestamp_ltz(dt, yyyy-MM-dd)| |
| +----------+--------------------------------+ |
| |2016-12-31| 2016-12-31 00:00:00| |
| +----------+--------------------------------+ |
| |
| Example 3: Using a format column to represent different formats. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame( |
| ... [('2015-04-08', 'yyyy-MM-dd'), ('2025+01+09', 'yyyy+MM+dd')], ['dt', 'fmt']) |
| >>> df.select('*', sf.to_timestamp_ltz('dt', 'fmt')).show() |
| +----------+----------+-------------------------+ |
| | dt| fmt|to_timestamp_ltz(dt, fmt)| |
| +----------+----------+-------------------------+ |
| |2015-04-08|yyyy-MM-dd| 2015-04-08 00:00:00| |
| |2025+01+09|yyyy+MM+dd| 2025-01-09 00:00:00| |
| +----------+----------+-------------------------+ |
| """ |
| if format is not None: |
| return _invoke_function_over_columns("to_timestamp_ltz", timestamp, format) |
| else: |
| return _invoke_function_over_columns("to_timestamp_ltz", timestamp) |
| |
| |
| @_try_remote_functions |
| def to_timestamp_ntz( |
| timestamp: "ColumnOrName", |
| format: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Parses the `timestamp` with the `format` to a timestamp without time zone. |
Returns null for invalid input.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
timestamp : :class:`~pyspark.sql.Column` or column name
Input column or column name.
format : :class:`~pyspark.sql.Column` or column name, optional
format to use to parse the values into `TimestampNTZType` timestamps.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.to_date` |
| :meth:`pyspark.sql.functions.to_timestamp` |
| :meth:`pyspark.sql.functions.to_timestamp_ltz` |
| :meth:`pyspark.sql.functions.to_utc_timestamp` |
| :meth:`pyspark.sql.functions.to_unix_timestamp` |
| :meth:`pyspark.sql.functions.date_format` |
| |
| Examples |
| -------- |
| Example 1: Using default format to parse the timestamp string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 12:12:12',)], ['ts']) |
| >>> df.select('*', sf.to_timestamp_ntz('ts')).show() |
| +-------------------+--------------------+ |
| | ts|to_timestamp_ntz(ts)| |
| +-------------------+--------------------+ |
| |2015-04-08 12:12:12| 2015-04-08 12:12:12| |
| +-------------------+--------------------+ |
| |
| Example 2: Using user-specified format 'yyyy-MM-dd' to parse the date string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2016-12-31',)], ['dt']) |
| >>> df.select('*', sf.to_timestamp_ntz(df.dt, sf.lit('yyyy-MM-dd'))).show() |
| +----------+--------------------------------+ |
| | dt|to_timestamp_ntz(dt, yyyy-MM-dd)| |
| +----------+--------------------------------+ |
| |2016-12-31| 2016-12-31 00:00:00| |
| +----------+--------------------------------+ |
| |
| Example 3: Using a format column to represent different formats. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame( |
| ... [('2015-04-08', 'yyyy-MM-dd'), ('2025+01+09', 'yyyy+MM+dd')], ['dt', 'fmt']) |
| >>> df.select('*', sf.to_timestamp_ntz('dt', 'fmt')).show() |
| +----------+----------+-------------------------+ |
| | dt| fmt|to_timestamp_ntz(dt, fmt)| |
| +----------+----------+-------------------------+ |
| |2015-04-08|yyyy-MM-dd| 2015-04-08 00:00:00| |
| |2025+01+09|yyyy+MM+dd| 2025-01-09 00:00:00| |
| +----------+----------+-------------------------+ |
| """ |
| if format is not None: |
| return _invoke_function_over_columns("to_timestamp_ntz", timestamp, format) |
| else: |
| return _invoke_function_over_columns("to_timestamp_ntz", timestamp) |
| |
| |
| # ---------------------------- misc functions ---------------------------------- |
| |
| |
| @_try_remote_functions |
| def current_catalog() -> Column: |
| """Returns the current catalog. |
| |
| .. versionadded:: 3.5.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.current_database` |
| :meth:`pyspark.sql.functions.current_schema` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.current_catalog()).show() |
| +-----------------+ |
| |current_catalog()| |
| +-----------------+ |
| | spark_catalog| |
| +-----------------+ |
| """ |
| return _invoke_function("current_catalog") |
| |
| |
| @_try_remote_functions |
| def current_database() -> Column: |
| """Returns the current database. |
| |
| .. versionadded:: 3.5.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.current_catalog` |
| :meth:`pyspark.sql.functions.current_schema` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.current_database()).show() |
| +----------------+ |
| |current_schema()| |
| +----------------+ |
| | default| |
| +----------------+ |
| """ |
| return _invoke_function("current_database") |
| |
| |
| @_try_remote_functions |
| def current_schema() -> Column: |
| """Returns the current database. |
| |
| .. versionadded:: 3.5.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.current_catalog` |
| :meth:`pyspark.sql.functions.current_database` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.current_schema()).show() |
| +----------------+ |
| |current_schema()| |
| +----------------+ |
| | default| |
| +----------------+ |
| """ |
| return _invoke_function("current_schema") |
| |
| |
| @_try_remote_functions |
| def current_user() -> Column: |
| """Returns the current database. |
| |
| .. versionadded:: 3.5.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.user` |
| :meth:`pyspark.sql.functions.session_user` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.current_user()).show() # doctest: +SKIP |
| +--------------+ |
| |current_user()| |
| +--------------+ |
| | ruifeng.zheng| |
| +--------------+ |
| """ |
| return _invoke_function("current_user") |
| |
| |
| @_try_remote_functions |
| def user() -> Column: |
| """Returns the current database. |
| |
| .. versionadded:: 3.5.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.current_user` |
| :meth:`pyspark.sql.functions.session_user` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.user()).show() # doctest: +SKIP |
| +--------------+ |
| | user()| |
| +--------------+ |
| | ruifeng.zheng| |
| +--------------+ |
| """ |
| return _invoke_function("user") |
| |
| |
| @_try_remote_functions |
| def session_user() -> Column: |
| """Returns the user name of current execution context. |
| |
| .. versionadded:: 4.0.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.user` |
| :meth:`pyspark.sql.functions.current_user` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.session_user()).show() # doctest: +SKIP |
| +--------------+ |
| |session_user()| |
| +--------------+ |
| | ruifeng.zheng| |
| +--------------+ |
| """ |
| return _invoke_function("session_user") |
| |
| |
| @_try_remote_functions |
| def uuid(seed: Optional[Union[Column, int]] = None) -> Column: |
| """Returns an universally unique identifier (UUID) string. |
| The value is returned as a canonical UUID 36-character string. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
seed : :class:`~pyspark.sql.Column` or int, optional
Random number seed to use.
| |
| Examples |
| -------- |
| Example 1: Generate UUIDs with random seed |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(5).select(sf.uuid()).show(truncate=False) # doctest: +SKIP |
| +------------------------------------+ |
| |uuid() | |
| +------------------------------------+ |
| |627ae05e-b319-42b5-b4e4-71c8c9754dd1| |
| |f781cce5-a2e2-464d-bc8b-426ff448e404| |
| |15e2e66e-8416-4ea2-af3c-409363408189| |
| |fb1d6178-7676-4791-baa9-f2ddcc494515| |
| |d48665e8-2657-4c6b-b7c8-8ae0cd646e41| |
| +------------------------------------+ |
| |
| Example 2: Generate UUIDs with a specified seed |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(0, 5, 1, 1).select(sf.uuid(seed=123)).show(truncate=False) |
| +------------------------------------+ |
| |uuid() | |
| +------------------------------------+ |
| |4c99192d-23d6-4d88-b814-a634398120f0| |
| |af506873-3c53-41e3-8354-a24856b8de8a| |
| |7b4b370e-e867-47e2-93c0-f6990463a12d| |
| |1c4d1733-ff1a-4a6c-b144-0b0345adf0d0| |
| |7478f235-f8bc-4112-8e59-a28f50e46890| |
| +------------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if seed is None: |
| return _invoke_function("uuid") |
| else: |
| return _invoke_function("uuid", _to_java_column(lit(seed))) |
| |
| |
| @_try_remote_functions |
| def crc32(col: "ColumnOrName") -> Column: |
| """ |
| Calculates the cyclic redundancy check value (CRC32) of a binary column and |
| returns the value as a bigint. |
| |
.. versionadded:: 1.5.0

.. versionchanged:: 3.4.0
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ABC',)], ['a']) |
| >>> df.select('*', sf.crc32('a')).show(truncate=False) |
| +---+----------+ |
| |a |crc32(a) | |
| +---+----------+ |
| |ABC|2743272264| |
| +---+----------+ |
| """ |
| return _invoke_function_over_columns("crc32", col) |
| |
| |
| @_try_remote_functions |
| def md5(col: "ColumnOrName") -> Column: |
| """Calculates the MD5 digest and returns the value as a 32 character hex string. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ABC',)], ['a']) |
| >>> df.select('*', sf.md5('a')).show(truncate=False) |
| +---+--------------------------------+ |
| |a |md5(a) | |
| +---+--------------------------------+ |
| |ABC|902fbdd2b1df0c4f70b4a5d23525e932| |
| +---+--------------------------------+ |
| """ |
| return _invoke_function_over_columns("md5", col) |
| |
| |
| @_try_remote_functions |
| def sha1(col: "ColumnOrName") -> Column: |
| """Returns the hex string result of SHA-1. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.sha` |
| :meth:`pyspark.sql.functions.sha2` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ABC',)], ['a']) |
| >>> df.select('*', sf.sha1('a')).show(truncate=False) |
| +---+----------------------------------------+ |
| |a |sha1(a) | |
| +---+----------------------------------------+ |
| |ABC|3c01bdbb26f358bab27f267924aa2c9a03fcfdb8| |
| +---+----------------------------------------+ |
| """ |
| return _invoke_function_over_columns("sha1", col) |
| |
| |
| @_try_remote_functions |
| def sha2(col: "ColumnOrName", numBits: int) -> Column: |
| """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384, |
| and SHA-512). The numBits indicates the desired bit length of the result, which must have a |
| value of 224, 256, 384, 512, or 0 (which is equivalent to 256). |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| numBits : int |
| the desired bit length of the result, which must have a |
| value of 224, 256, 384, 512, or 0 (which is equivalent to 256). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.sha` |
| :meth:`pyspark.sql.functions.sha1` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([['Alice'], ['Bob']], ['name']) |
| >>> df.select('*', sf.sha2('name', 256)).show(truncate=False) |
| +-----+----------------------------------------------------------------+ |
| |name |sha2(name, 256) | |
| +-----+----------------------------------------------------------------+ |
| |Alice|3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043| |
| |Bob |cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961| |
| +-----+----------------------------------------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if numBits not in [0, 224, 256, 384, 512]: |
| raise PySparkValueError( |
| errorClass="VALUE_NOT_ALLOWED", |
| messageParameters={ |
| "arg_name": "numBits", |
| "allowed_values": "[0, 224, 256, 384, 512]", |
| }, |
| ) |
| return _invoke_function("sha2", _to_java_column(col), numBits) |
| |
| |
| @_try_remote_functions |
| def hash(*cols: "ColumnOrName") -> Column: |
| """Calculates the hash code of given columns, and returns the result as an int column. |
| |
| .. versionadded:: 2.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
| one or more columns to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| hash value as int column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.xxhash64` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2']) |
| >>> df.select('*', sf.hash('c1')).show() |
| +---+---+----------+ |
| | c1| c2| hash(c1)| |
| +---+---+----------+ |
| |ABC|DEF|-757602832| |
| +---+---+----------+ |
| |
| >>> df.select('*', sf.hash('c1', df.c2)).show() |
| +---+---+------------+ |
| | c1| c2|hash(c1, c2)| |
| +---+---+------------+ |
| |ABC|DEF| 599895104| |
| +---+---+------------+ |
| |
| >>> df.select('*', sf.hash('*')).show() |
| +---+---+------------+ |
| | c1| c2|hash(c1, c2)| |
| +---+---+------------+ |
| |ABC|DEF| 599895104| |
| +---+---+------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("hash", cols) |
| |
| |
| @_try_remote_functions |
| def xxhash64(*cols: "ColumnOrName") -> Column: |
| """Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm, |
| and returns the result as a long column. The hash computation uses an initial seed of 42. |
| |
| .. versionadded:: 3.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
| one or more columns to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| hash value as long column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.hash` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2']) |
| >>> df.select('*', sf.xxhash64('c1')).show() |
| +---+---+-------------------+ |
| | c1| c2| xxhash64(c1)| |
| +---+---+-------------------+ |
| |ABC|DEF|4105715581806190027| |
| +---+---+-------------------+ |
| |
| >>> df.select('*', sf.xxhash64('c1', df.c2)).show() |
| +---+---+-------------------+ |
| | c1| c2| xxhash64(c1, c2)| |
| +---+---+-------------------+ |
| |ABC|DEF|3233247871021311208| |
| +---+---+-------------------+ |
| |
| >>> df.select('*', sf.xxhash64('*')).show() |
| +---+---+-------------------+ |
| | c1| c2| xxhash64(c1, c2)| |
| +---+---+-------------------+ |
| |ABC|DEF|3233247871021311208| |
| +---+---+-------------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("xxhash64", cols) |
| |
| |
| @_try_remote_functions |
| def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None) -> Column: |
| """ |
| Returns `null` if the input column is `true`; throws an exception |
| with the provided error message otherwise. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| column name or column that represents the input column to test |
| errMsg : :class:`~pyspark.sql.Column` or literal string, optional |
| A Python string literal or column containing the error message |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| `null` if the input column is `true` otherwise throws an error with specified message. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.raise_error` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(0, 1)], ['a', 'b']) |
>>> df.select(sf.assert_true(df.a < df.b)).show()  # doctest: +SKIP
| +------------------------------------------------------+ |
| |assert_true((a < b), '(a#788L < b#789L)' is not true!)| |
| +------------------------------------------------------+ |
| | NULL| |
| +------------------------------------------------------+ |
| |
| >>> df.select('*', sf.assert_true(df.a < df.b, df.a)).show() |
| +---+---+-----------------------+ |
| | a| b|assert_true((a < b), a)| |
| +---+---+-----------------------+ |
| | 0| 1| NULL| |
| +---+---+-----------------------+ |
| |
| >>> df.select('*', sf.assert_true(df.a < df.b, 'error')).show() |
| +---+---+---------------------------+ |
| | a| b|assert_true((a < b), error)| |
| +---+---+---------------------------+ |
| | 0| 1| NULL| |
| +---+---+---------------------------+ |
| |
| >>> df.select('*', sf.assert_true(df.a > df.b, 'My error msg')).show() # doctest: +SKIP |
| ... |
| java.lang.RuntimeException: My error msg |
| ... |
| """ |
| errMsg = _enum_to_value(errMsg) |
| if errMsg is None: |
| return _invoke_function_over_columns("assert_true", col) |
| if not isinstance(errMsg, (str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_STR", |
| messageParameters={"arg_name": "errMsg", "arg_type": type(errMsg).__name__}, |
| ) |
| return _invoke_function_over_columns("assert_true", col, lit(errMsg)) |
| |
| |
| @_try_remote_functions |
| def raise_error(errMsg: Union[Column, str]) -> Column: |
| """ |
| Throws an exception with the provided error message. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| errMsg : :class:`~pyspark.sql.Column` or literal string |
| A Python string literal or column containing the error message |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| throws an error with specified message. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.assert_true` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.raise_error("My error message")).show() # doctest: +SKIP |
| ... |
| java.lang.RuntimeException: My error message |
| ... |
| """ |
| errMsg = _enum_to_value(errMsg) |
| if not isinstance(errMsg, (str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_STR", |
| messageParameters={"arg_name": "errMsg", "arg_type": type(errMsg).__name__}, |
| ) |
| return _invoke_function_over_columns("raise_error", lit(errMsg)) |
| |
| |
| # ---------------------- String/Binary functions ------------------------------ |
| |
| |
| @_try_remote_functions |
| def upper(col: "ColumnOrName") -> Column: |
| """ |
| Converts a string expression to upper case. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| upper case values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.lower` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") |
| >>> df.select("*", sf.upper("value")).show() |
| +----------+------------+ |
| | value|upper(value)| |
| +----------+------------+ |
| | Spark| SPARK| |
| | PySpark| PYSPARK| |
| |Pandas API| PANDAS API| |
| +----------+------------+ |
| """ |
| return _invoke_function_over_columns("upper", col) |
| |
| |
| @_try_remote_functions |
| def lower(col: "ColumnOrName") -> Column: |
| """ |
| Converts a string expression to lower case. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| lower case values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.upper` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") |
| >>> df.select("*", sf.lower("value")).show() |
| +----------+------------+ |
| | value|lower(value)| |
| +----------+------------+ |
| | Spark| spark| |
| | PySpark| pyspark| |
| |Pandas API| pandas api| |
| +----------+------------+ |
| """ |
| return _invoke_function_over_columns("lower", col) |
| |
| |
| @_try_remote_functions |
| def ascii(col: "ColumnOrName") -> Column: |
| """ |
| Computes the numeric value of the first character of the string column. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| numeric value. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") |
| >>> df.select("*", sf.ascii("value")).show() |
| +----------+------------+ |
| | value|ascii(value)| |
| +----------+------------+ |
| | Spark| 83| |
| | PySpark| 80| |
| |Pandas API| 80| |
| +----------+------------+ |
| """ |
| return _invoke_function_over_columns("ascii", col) |
| |
| |
| @_try_remote_functions |
| def base64(col: "ColumnOrName") -> Column: |
| """ |
| Computes the BASE64 encoding of a binary column and returns it as a string column. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| BASE64 encoding of string value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.unbase64` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING") |
| >>> df.select("*", sf.base64("value")).show() |
| +----------+----------------+ |
| | value| base64(value)| |
| +----------+----------------+ |
| | Spark| U3Bhcms=| |
| | PySpark| UHlTcGFyaw==| |
| |Pandas API|UGFuZGFzIEFQSQ==| |
| +----------+----------------+ |
| """ |
| return _invoke_function_over_columns("base64", col) |
| |
| |
| @_try_remote_functions |
| def unbase64(col: "ColumnOrName") -> Column: |
| """ |
| Decodes a BASE64 encoded string column and returns it as a binary column. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
the decoded binary value.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.base64` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["U3Bhcms=", "UHlTcGFyaw==", "UGFuZGFzIEFQSQ=="], "STRING") |
| >>> df.select("*", sf.unbase64("value")).show(truncate=False) |
| +----------------+-------------------------------+ |
| |value |unbase64(value) | |
| +----------------+-------------------------------+ |
| |U3Bhcms= |[53 70 61 72 6B] | |
| |UHlTcGFyaw== |[50 79 53 70 61 72 6B] | |
| |UGFuZGFzIEFQSQ==|[50 61 6E 64 61 73 20 41 50 49]| |
| +----------------+-------------------------------+ |
| """ |
| return _invoke_function_over_columns("unbase64", col) |
| |
| |
| @_try_remote_functions |
| def ltrim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: |
| """ |
Trims the spaces from the left end of the specified string value.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| trim : :class:`~pyspark.sql.Column` or column name, optional |
The characters to trim; the default value is a single space.
| |
| .. versionadded:: 4.0.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| left trimmed values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.trim` |
| :meth:`pyspark.sql.functions.rtrim` |
| |
| Examples |
| -------- |
| Example 1: Trim the spaces |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") |
| >>> df.select("*", sf.ltrim("value")).show() |
| +--------+------------+ |
| | value|ltrim(value)| |
| +--------+------------+ |
| | Spark| Spark| |
| | Spark | Spark | |
| | Spark| Spark| |
| +--------+------------+ |
| |
| Example 2: Trim specified characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["***Spark", "Spark**", "*Spark"], "STRING") |
| >>> df.select("*", sf.ltrim("value", sf.lit("*"))).show() |
| +--------+--------------------------+ |
| | value|TRIM(LEADING * FROM value)| |
| +--------+--------------------------+ |
| |***Spark| Spark| |
| | Spark**| Spark**| |
| | *Spark| Spark| |
| +--------+--------------------------+ |
| |
| Example 3: Trim a column containing different characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("**Spark*", "*"), ("==Spark=", "=")], ["value", "t"]) |
| >>> df.select("*", sf.ltrim("value", "t")).show() |
| +--------+---+--------------------------+ |
| | value| t|TRIM(LEADING t FROM value)| |
| +--------+---+--------------------------+ |
| |**Spark*| *| Spark*| |
| |==Spark=| =| Spark=| |
| +--------+---+--------------------------+ |
| """ |
| if trim is not None: |
| return _invoke_function_over_columns("ltrim", col, trim) |
| else: |
| return _invoke_function_over_columns("ltrim", col) |
| |
| |
| @_try_remote_functions |
| def rtrim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: |
| """ |
Trims the spaces from the right end of the specified string value.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| trim : :class:`~pyspark.sql.Column` or column name, optional |
The characters to trim; the default value is a single space.
| |
| .. versionadded:: 4.0.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| right trimmed values. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.trim` |
| :meth:`pyspark.sql.functions.ltrim` |
| |
| Examples |
| -------- |
| Example 1: Trim the spaces |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") |
| >>> df.select("*", sf.rtrim("value")).show() |
| +--------+------------+ |
| | value|rtrim(value)| |
| +--------+------------+ |
| | Spark| Spark| |
| | Spark | Spark| |
| | Spark| Spark| |
| +--------+------------+ |
| |
| Example 2: Trim specified characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["***Spark", "Spark**", "*Spark"], "STRING") |
| >>> df.select("*", sf.rtrim("value", sf.lit("*"))).show() |
| +--------+---------------------------+ |
| | value|TRIM(TRAILING * FROM value)| |
| +--------+---------------------------+ |
| |***Spark| ***Spark| |
| | Spark**| Spark| |
| | *Spark| *Spark| |
| +--------+---------------------------+ |
| |
| Example 3: Trim a column containing different characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("**Spark*", "*"), ("==Spark=", "=")], ["value", "t"]) |
| >>> df.select("*", sf.rtrim("value", "t")).show() |
| +--------+---+---------------------------+ |
| | value| t|TRIM(TRAILING t FROM value)| |
| +--------+---+---------------------------+ |
| |**Spark*| *| **Spark| |
| |==Spark=| =| ==Spark| |
| +--------+---+---------------------------+ |
| """ |
| if trim is not None: |
| return _invoke_function_over_columns("rtrim", col, trim) |
| else: |
| return _invoke_function_over_columns("rtrim", col) |
| |
| |
| @_try_remote_functions |
| def trim(col: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: |
| """ |
Trims the spaces from both ends of the specified string column.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| trim : :class:`~pyspark.sql.Column` or column name, optional |
The characters to trim; the default value is a single space.
| |
| .. versionadded:: 4.0.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| trimmed values from both sides. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.ltrim` |
| :meth:`pyspark.sql.functions.rtrim` |
| |
| Examples |
| -------- |
| Example 1: Trim the spaces |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING") |
| >>> df.select("*", sf.trim("value")).show() |
| +--------+-----------+ |
| | value|trim(value)| |
| +--------+-----------+ |
| | Spark| Spark| |
| | Spark | Spark| |
| | Spark| Spark| |
| +--------+-----------+ |
| |
| Example 2: Trim specified characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["***Spark", "Spark**", "*Spark"], "STRING") |
| >>> df.select("*", sf.trim("value", sf.lit("*"))).show() |
| +--------+-----------------------+ |
| | value|TRIM(BOTH * FROM value)| |
| +--------+-----------------------+ |
| |***Spark| Spark| |
| | Spark**| Spark| |
| | *Spark| Spark| |
| +--------+-----------------------+ |
| |
| Example 3: Trim a column containing different characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("**Spark*", "*"), ("==Spark=", "=")], ["value", "t"]) |
| >>> df.select("*", sf.trim("value", "t")).show() |
| +--------+---+-----------------------+ |
| | value| t|TRIM(BOTH t FROM value)| |
| +--------+---+-----------------------+ |
| |**Spark*| *| Spark| |
| |==Spark=| =| Spark| |
| +--------+---+-----------------------+ |
| """ |
| if trim is not None: |
| return _invoke_function_over_columns("trim", col, trim) |
| else: |
| return _invoke_function_over_columns("trim", col) |
| |
| |
| @_try_remote_functions |
| def concat_ws(sep: str, *cols: "ColumnOrName") -> Column: |
| """ |
| Concatenates multiple input string columns together into a single string column, |
| using the given separator. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| sep : literal string |
| words separator. |
| cols : :class:`~pyspark.sql.Column` or column name |
| list of columns to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| string of concatenated words. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.concat` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("abcd", "123")], ["s", "d"]) |
| >>> df.select("*", sf.concat_ws("-", df.s, "d", sf.lit("xyz"))).show() |
| +----+---+-----------------------+ |
| | s| d|concat_ws(-, s, d, xyz)| |
| +----+---+-----------------------+ |
| |abcd|123| abcd-123-xyz| |
| +----+---+-----------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| sc = _get_active_spark_context() |
| return _invoke_function("concat_ws", _enum_to_value(sep), _to_seq(sc, cols, _to_java_column)) |
| |
| |
| @_try_remote_functions |
| def decode(col: "ColumnOrName", charset: str) -> Column: |
| """ |
Decodes the first argument from a binary into a string using the provided character set
| (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'). |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| charset : literal string |
| charset to use to decode to. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.encode` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(b"\x61\x62\x63\x64",)], ["a"]) |
| >>> df.select("*", sf.decode("a", "UTF-8")).show() |
| +-------------+----------------+ |
| | a|decode(a, UTF-8)| |
| +-------------+----------------+ |
| |[61 62 63 64]| abcd| |
| +-------------+----------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("decode", _to_java_column(col), _enum_to_value(charset)) |
| |
| |
| @_try_remote_functions |
| def encode(col: "ColumnOrName", charset: str) -> Column: |
| """ |
Encodes the first argument from a string into a binary using the provided character set
| (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'). |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| charset : literal string |
| charset to use to encode. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column for computed results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.decode` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("abcd",)], ["c"]) |
| >>> df.select("*", sf.encode("c", "UTF-8")).show() |
| +----+----------------+ |
| | c|encode(c, UTF-8)| |
| +----+----------------+ |
| |abcd| [61 62 63 64]| |
| +----+----------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("encode", _to_java_column(col), _enum_to_value(charset)) |
| |
| |
| @_try_remote_functions |
| def is_valid_utf8(str: "ColumnOrName") -> Column: |
| """ |
| Returns true if the input is a valid UTF-8 string, otherwise returns false. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| A column of strings, each representing a UTF-8 byte sequence. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| whether the input string is a valid UTF-8 string. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_valid_utf8` |
| :meth:`pyspark.sql.functions.validate_utf8` |
| :meth:`pyspark.sql.functions.try_validate_utf8` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.is_valid_utf8(sf.lit("SparkSQL"))).show() |
| +-----------------------+ |
| |is_valid_utf8(SparkSQL)| |
| +-----------------------+ |
| | true| |
| +-----------------------+ |
| """ |
| return _invoke_function_over_columns("is_valid_utf8", str) |
| |
| |
| @_try_remote_functions |
| def make_valid_utf8(str: "ColumnOrName") -> Column: |
| """ |
| Returns a new string in which all invalid UTF-8 byte sequences, if any, are replaced by the |
| Unicode replacement character (U+FFFD). |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| A column of strings, each representing a UTF-8 byte sequence. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the valid UTF-8 version of the given input string. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.is_valid_utf8` |
| :meth:`pyspark.sql.functions.validate_utf8` |
| :meth:`pyspark.sql.functions.try_validate_utf8` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.make_valid_utf8(sf.lit("SparkSQL"))).show() |
| +-------------------------+ |
| |make_valid_utf8(SparkSQL)| |
| +-------------------------+ |
| | SparkSQL| |
| +-------------------------+ |
| """ |
| return _invoke_function_over_columns("make_valid_utf8", str) |
| |
| |
| @_try_remote_functions |
| def validate_utf8(str: "ColumnOrName") -> Column: |
| """ |
| Returns the input value if it corresponds to a valid UTF-8 string, or emits an error otherwise. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| A column of strings, each representing a UTF-8 byte sequence. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the input string if it is a valid UTF-8 string, error otherwise. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.is_valid_utf8` |
| :meth:`pyspark.sql.functions.make_valid_utf8` |
| :meth:`pyspark.sql.functions.try_validate_utf8` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.validate_utf8(sf.lit("SparkSQL"))).show() |
| +-----------------------+ |
| |validate_utf8(SparkSQL)| |
| +-----------------------+ |
| | SparkSQL| |
| +-----------------------+ |
| """ |
| return _invoke_function_over_columns("validate_utf8", str) |
| |
| |
| @_try_remote_functions |
| def try_validate_utf8(str: "ColumnOrName") -> Column: |
| """ |
| Returns the input value if it corresponds to a valid UTF-8 string, or NULL otherwise. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| A column of strings, each representing a UTF-8 byte sequence. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the input string if it is a valid UTF-8 string, null otherwise. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.is_valid_utf8` |
| :meth:`pyspark.sql.functions.make_valid_utf8` |
| :meth:`pyspark.sql.functions.validate_utf8` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.try_validate_utf8(sf.lit("SparkSQL"))).show() |
| +---------------------------+ |
| |try_validate_utf8(SparkSQL)| |
| +---------------------------+ |
| | SparkSQL| |
| +---------------------------+ |
| """ |
| return _invoke_function_over_columns("try_validate_utf8", str) |
| |
| |
| @_try_remote_functions |
| def format_number(col: "ColumnOrName", d: int) -> Column: |
| """ |
Formats the number X to a format like '#,###,###.##', rounded to d decimal places
with HALF_EVEN round mode, and returns the result as a string.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| the column name of the numeric value to be formatted |
d : int
the number of decimal places to round to
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column of formatted results. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(5,)], ["a"]) |
| >>> df.select("*", sf.format_number("a", 4), sf.format_number(df.a, 6)).show() |
| +---+-------------------+-------------------+ |
| | a|format_number(a, 4)|format_number(a, 6)| |
| +---+-------------------+-------------------+ |
| | 5| 5.0000| 5.000000| |
| +---+-------------------+-------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("format_number", _to_java_column(col), _enum_to_value(d)) |
| |
| |
| @_try_remote_functions |
| def format_string(format: str, *cols: "ColumnOrName") -> Column: |
| """ |
| Formats the arguments in printf-style and returns the result as a string column. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| format : literal string |
string that can contain embedded format tags and is used as the result column's value
| cols : :class:`~pyspark.sql.Column` or column name |
| column names or :class:`~pyspark.sql.Column`\\s to be used in formatting |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the column of formatted results. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.printf` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(5, "hello")], ["a", "b"]) |
| >>> df.select("*", sf.format_string('%d %s', "a", df.b)).show() |
| +---+-----+--------------------------+ |
| | a| b|format_string(%d %s, a, b)| |
| +---+-----+--------------------------+ |
| | 5|hello| 5 hello| |
| +---+-----+--------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| sc = _get_active_spark_context() |
| return _invoke_function( |
| "format_string", _enum_to_value(format), _to_seq(sc, cols, _to_java_column) |
| ) |
| |
| |
| @_try_remote_functions |
| def instr(str: "ColumnOrName", substr: Union[Column, str]) -> Column: |
| """ |
Locate the position of the first occurrence of substr column in the given string.
Returns null if either of the arguments is null.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
The position is not zero-based, but 1-based. Returns 0 if substr
could not be found in str.
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| substr : :class:`~pyspark.sql.Column` or literal string |
| substring to look for. |
| |
| .. versionchanged:: 4.0.0 |
| `substr` now accepts column. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| location of the first occurrence of the substring as integer. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.locate` |
| :meth:`pyspark.sql.functions.substr` |
| :meth:`pyspark.sql.functions.substring` |
| :meth:`pyspark.sql.functions.substring_index` |
| |
| Examples |
| -------- |
| Example 1: Using a literal string as the 'substring' |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("abcd",), ("xyz",)], ["s",]) |
| >>> df.select("*", sf.instr(df.s, "b")).show() |
| +----+-----------+ |
| | s|instr(s, b)| |
| +----+-----------+ |
| |abcd| 2| |
| | xyz| 0| |
| +----+-----------+ |
| |
| Example 2: Using a Column 'substring' |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("abcd",), ("xyz",)], ["s",]) |
| >>> df.select("*", sf.instr("s", sf.lit("abc").substr(0, 2))).show() |
| +----+---------------------------+ |
| | s|instr(s, substr(abc, 0, 2))| |
| +----+---------------------------+ |
| |abcd| 1| |
| | xyz| 0| |
| +----+---------------------------+ |
| """ |
| return _invoke_function_over_columns("instr", str, lit(substr)) |
| |
| |
| @_try_remote_functions |
| def overlay( |
| src: "ColumnOrName", |
| replace: "ColumnOrName", |
| pos: Union["ColumnOrName", int], |
| len: Union["ColumnOrName", int] = -1, |
| ) -> Column: |
| """ |
| Overlay the specified portion of `src` with `replace`, |
| starting from byte position `pos` of `src` and proceeding for `len` bytes. |
| |
| .. versionadded:: 3.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| src : :class:`~pyspark.sql.Column` or column name |
| the string that will be replaced |
| replace : :class:`~pyspark.sql.Column` or column name |
| the substitution string |
| pos : :class:`~pyspark.sql.Column` or column name or int |
| the starting position in src |
| len : :class:`~pyspark.sql.Column` or column name or int, optional |
the number of bytes of the src string to replace with 'replace';
defaults to -1, which represents the length of the 'replace' string
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| string with replaced values. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("SPARK_SQL", "CORE")], ("x", "y")) |
| >>> df.select("*", sf.overlay("x", df.y, 7)).show() |
| +---------+----+--------------------+ |
| | x| y|overlay(x, y, 7, -1)| |
| +---------+----+--------------------+ |
| |SPARK_SQL|CORE| SPARK_CORE| |
| +---------+----+--------------------+ |
| |
| >>> df.select("*", sf.overlay("x", df.y, 7, 0)).show() |
| +---------+----+-------------------+ |
| | x| y|overlay(x, y, 7, 0)| |
| +---------+----+-------------------+ |
| |SPARK_SQL|CORE| SPARK_CORESQL| |
| +---------+----+-------------------+ |
| |
| >>> df.select("*", sf.overlay("x", "y", 7, 2)).show() |
| +---------+----+-------------------+ |
| | x| y|overlay(x, y, 7, 2)| |
| +---------+----+-------------------+ |
| |SPARK_SQL|CORE| SPARK_COREL| |
| +---------+----+-------------------+ |
| """ |
| pos = _enum_to_value(pos) |
| if not isinstance(pos, (int, str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_INT_OR_STR", |
| messageParameters={"arg_name": "pos", "arg_type": type(pos).__name__}, |
| ) |
| len = _enum_to_value(len) |
| if len is not None and not isinstance(len, (int, str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_INT_OR_STR", |
| messageParameters={"arg_name": "len", "arg_type": type(len).__name__}, |
| ) |
| |
| if isinstance(pos, int): |
| pos = lit(pos) |
| if isinstance(len, int): |
| len = lit(len) |
| |
| return _invoke_function_over_columns("overlay", src, replace, pos, len) |
| |
| |
| @_try_remote_functions |
| def sentences( |
| string: "ColumnOrName", |
| language: Optional["ColumnOrName"] = None, |
| country: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Splits a string into arrays of sentences, where each sentence is an array of words. |
The `language` and `country` arguments are optional.
When they are omitted:
1. If they are both omitted, the `Locale.ROOT - locale(language='', country='')` is used.
The `Locale.ROOT` is regarded as the base locale of all locales, and is used as the
language/country neutral locale for the locale sensitive operations.
2. If the `country` is omitted, the `locale(language, country='')` is used.
When they are null:
1. If they are both `null`, the `Locale.US - locale(language='en', country='US')` is used.
2. If the `language` is null and the `country` is not null,
the `Locale.US - locale(language='en', country='US')` is used.
3. If the `language` is not null and the `country` is null, the `locale(language)` is used.
4. If neither is `null`, the `locale(language, country)` is used.
| |
| .. versionadded:: 3.2.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. versionchanged:: 4.0.0 |
| Supports `sentences(string, language)`. |
| |
| Parameters |
| ---------- |
| string : :class:`~pyspark.sql.Column` or column name |
| a string to be split |
| language : :class:`~pyspark.sql.Column` or column name, optional |
| a language of the locale |
| country : :class:`~pyspark.sql.Column` or column name, optional |
| a country of the locale |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| arrays of split sentences. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.split` |
| :meth:`pyspark.sql.functions.split_part` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("This is an example sentence.", )], ["s"]) |
| >>> df.select("*", sf.sentences(df.s, sf.lit("en"), sf.lit("US"))).show(truncate=False) |
| +----------------------------+-----------------------------------+ |
| |s |sentences(s, en, US) | |
| +----------------------------+-----------------------------------+ |
| |This is an example sentence.|[[This, is, an, example, sentence]]| |
| +----------------------------+-----------------------------------+ |
| |
| >>> df.select("*", sf.sentences(df.s, sf.lit("en"))).show(truncate=False) |
| +----------------------------+-----------------------------------+ |
| |s |sentences(s, en, ) | |
| +----------------------------+-----------------------------------+ |
| |This is an example sentence.|[[This, is, an, example, sentence]]| |
| +----------------------------+-----------------------------------+ |
| |
| >>> df.select("*", sf.sentences(df.s)).show(truncate=False) |
| +----------------------------+-----------------------------------+ |
| |s |sentences(s, , ) | |
| +----------------------------+-----------------------------------+ |
| |This is an example sentence.|[[This, is, an, example, sentence]]| |
| +----------------------------+-----------------------------------+ |
| """ |
| if language is None: |
| language = lit("") |
| if country is None: |
| country = lit("") |
| |
| return _invoke_function_over_columns("sentences", string, language, country) |
| |
| |
| @_try_remote_functions |
| def substring( |
| str: "ColumnOrName", |
| pos: Union["ColumnOrName", int], |
| len: Union["ColumnOrName", int], |
| ) -> Column: |
| """ |
| Substring starts at `pos` and is of length `len` when str is String type or |
| returns the slice of byte array that starts at `pos` in byte and is of length `len` |
| when str is Binary type. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
The position is not zero-based but 1-based.
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| pos : :class:`~pyspark.sql.Column` or column name or int |
| starting position in str. |
| |
| .. versionchanged:: 4.0.0 |
| `pos` now accepts column and column name. |
| |
| len : :class:`~pyspark.sql.Column` or column name or int |
| length of chars. |
| |
| .. versionchanged:: 4.0.0 |
| `len` now accepts column and column name. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| substring of given value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.instr` |
| :meth:`pyspark.sql.functions.locate` |
| :meth:`pyspark.sql.functions.substr` |
| :meth:`pyspark.sql.functions.substring_index` |
| :meth:`pyspark.sql.Column.substr` |
| |
| Examples |
| -------- |
| Example 1: Using literal integers as arguments |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('abcd',)], ['s',]) |
| >>> df.select('*', sf.substring(df.s, 1, 2)).show() |
| +----+------------------+ |
| | s|substring(s, 1, 2)| |
| +----+------------------+ |
| |abcd| ab| |
| +----+------------------+ |
| |
| Example 2: Using columns as arguments |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('Spark', 2, 3)], ['s', 'p', 'l']) |
| >>> df.select('*', sf.substring(df.s, 2, df.l)).show() |
| +-----+---+---+------------------+ |
| | s| p| l|substring(s, 2, l)| |
| +-----+---+---+------------------+ |
| |Spark| 2| 3| par| |
| +-----+---+---+------------------+ |
| |
| >>> df.select('*', sf.substring(df.s, df.p, 3)).show() |
| +-----+---+---+------------------+ |
| | s| p| l|substring(s, p, 3)| |
| +-----+---+---+------------------+ |
| |Spark| 2| 3| par| |
| +-----+---+---+------------------+ |
| |
| >>> df.select('*', sf.substring(df.s, df.p, df.l)).show() |
| +-----+---+---+------------------+ |
| | s| p| l|substring(s, p, l)| |
| +-----+---+---+------------------+ |
| |Spark| 2| 3| par| |
| +-----+---+---+------------------+ |
| |
| Example 3: Using column names as arguments |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('Spark', 2, 3)], ['s', 'p', 'l']) |
| >>> df.select('*', sf.substring(df.s, 2, 'l')).show() |
| +-----+---+---+------------------+ |
| | s| p| l|substring(s, 2, l)| |
| +-----+---+---+------------------+ |
| |Spark| 2| 3| par| |
| +-----+---+---+------------------+ |
| |
| >>> df.select('*', sf.substring('s', 'p', 'l')).show() |
| +-----+---+---+------------------+ |
| | s| p| l|substring(s, p, l)| |
| +-----+---+---+------------------+ |
| |Spark| 2| 3| par| |
| +-----+---+---+------------------+ |
| """ |
| pos = _enum_to_value(pos) |
| pos = lit(pos) if isinstance(pos, int) else pos |
| len = _enum_to_value(len) |
| len = lit(len) if isinstance(len, int) else len |
| return _invoke_function_over_columns("substring", str, pos, len) |
| |
| |
| @_try_remote_functions |
| def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: |
| """ |
| Returns the substring from string str before count occurrences of the delimiter delim. |
If count is positive, everything to the left of the final delimiter (counting from the left) is
returned. If count is negative, everything to the right of the final delimiter (counting from the
right) is returned. substring_index performs a case-sensitive match when searching for delim.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| delim : literal string |
| delimiter of values. |
| count : int |
| number of occurrences. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| substring of given value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.instr` |
| :meth:`pyspark.sql.functions.locate` |
| :meth:`pyspark.sql.functions.substr` |
| :meth:`pyspark.sql.functions.substring` |
| :meth:`pyspark.sql.Column.substr` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('a.b.c.d',)], ['s']) |
| >>> df.select('*', sf.substring_index(df.s, '.', 2)).show() |
| +-------+------------------------+ |
| | s|substring_index(s, ., 2)| |
| +-------+------------------------+ |
| |a.b.c.d| a.b| |
| +-------+------------------------+ |
| |
| >>> df.select('*', sf.substring_index('s', '.', -3)).show() |
| +-------+-------------------------+ |
| | s|substring_index(s, ., -3)| |
| +-------+-------------------------+ |
| |a.b.c.d| b.c.d| |
| +-------+-------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "substring_index", _to_java_column(str), _enum_to_value(delim), _enum_to_value(count) |
| ) |
| |
| |
| @_try_remote_functions |
| def levenshtein( |
| left: "ColumnOrName", right: "ColumnOrName", threshold: Optional[int] = None |
| ) -> Column: |
| """Computes the Levenshtein distance of the two given strings. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| left : :class:`~pyspark.sql.Column` or column name |
| first column value. |
| right : :class:`~pyspark.sql.Column` or column name |
| second column value. |
| threshold : int, optional |
if set, the function returns the Levenshtein distance of the two given strings
when it is less than or equal to the threshold, and -1 otherwise
| |
.. versionadded:: 3.5.0
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Levenshtein distance as integer value. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r']) |
| >>> df.select('*', sf.levenshtein('l', 'r')).show() |
| +------+-------+-----------------+ |
| | l| r|levenshtein(l, r)| |
| +------+-------+-----------------+ |
| |kitten|sitting| 3| |
| +------+-------+-----------------+ |
| |
| >>> df.select('*', sf.levenshtein(df.l, df.r, 2)).show() |
| +------+-------+--------------------+ |
| | l| r|levenshtein(l, r, 2)| |
| +------+-------+--------------------+ |
| |kitten|sitting| -1| |
| +------+-------+--------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if threshold is None: |
| return _invoke_function_over_columns("levenshtein", left, right) |
| else: |
| return _invoke_function( |
| "levenshtein", _to_java_column(left), _to_java_column(right), _enum_to_value(threshold) |
| ) |
| |
| |
| @_try_remote_functions |
| def locate(substr: str, str: "ColumnOrName", pos: int = 1) -> Column: |
| """ |
| Locate the position of the first occurrence of substr in a string column, after position pos. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| substr : literal string |
| a string |
| str : :class:`~pyspark.sql.Column` or column name |
| a Column of :class:`pyspark.sql.types.StringType` |
| pos : int, optional |
start position (1 based)
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| position of the substring. |
| |
| Notes |
| ----- |
The position is not zero-based but 1-based. Returns 0 if substr
| could not be found in str. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.instr` |
| :meth:`pyspark.sql.functions.substr` |
| :meth:`pyspark.sql.functions.substring` |
| :meth:`pyspark.sql.functions.substring_index` |
| :meth:`pyspark.sql.Column.substr` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('abcd',)], ['s',]) |
| >>> df.select('*', sf.locate('b', 's', 1)).show() |
| +----+---------------+ |
| | s|locate(b, s, 1)| |
| +----+---------------+ |
| |abcd| 2| |
| +----+---------------+ |
| |
| >>> df.select('*', sf.locate('b', df.s, 3)).show() |
| +----+---------------+ |
| | s|locate(b, s, 3)| |
| +----+---------------+ |
| |abcd| 0| |
| +----+---------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "locate", _enum_to_value(substr), _to_java_column(str), _enum_to_value(pos) |
| ) |
| |
| |
| @_try_remote_functions |
| def lpad( |
| col: "ColumnOrName", |
| len: Union[Column, int], |
| pad: Union[Column, str], |
| ) -> Column: |
| """ |
| Left-pad the string column to width `len` with `pad`. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| len : :class:`~pyspark.sql.Column` or int |
| length of the final string. |
| |
| .. versionchanged:: 4.0.0 |
`len` now accepts column.
| |
| pad : :class:`~pyspark.sql.Column` or literal string |
| chars to prepend. |
| |
| .. versionchanged:: 4.0.0 |
`pad` now accepts column.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| left padded result. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.rpad` |
| |
| Examples |
| -------- |
| Example 1: Pad with a literal string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('abcd',), ('xyz',), ('12',)], ['s',]) |
| >>> df.select("*", sf.lpad(df.s, 6, '#')).show() |
| +----+-------------+ |
| | s|lpad(s, 6, #)| |
| +----+-------------+ |
| |abcd| ##abcd| |
| | xyz| ###xyz| |
| | 12| ####12| |
| +----+-------------+ |
| |
| Example 2: Pad with a bytes column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('abcd',), ('xyz',), ('12',)], ['s',]) |
| >>> df.select("*", sf.lpad(df.s, 6, sf.lit(b"\x75\x76"))).show() |
| +----+-------------------+ |
| | s|lpad(s, 6, X'7576')| |
| +----+-------------------+ |
| |abcd| uvabcd| |
| | xyz| uvuxyz| |
| | 12| uvuv12| |
| +----+-------------------+ |
| """ |
| return _invoke_function_over_columns("lpad", col, lit(len), lit(pad)) |
| |
| |
| @_try_remote_functions |
| def rpad( |
| col: "ColumnOrName", |
| len: Union[Column, int], |
| pad: Union[Column, str], |
| ) -> Column: |
| """ |
| Right-pad the string column to width `len` with `pad`. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
col : :class:`~pyspark.sql.Column` or column name
| target column to work on. |
| len : :class:`~pyspark.sql.Column` or int |
| length of the final string. |
| |
| .. versionchanged:: 4.0.0 |
`len` now accepts column.
| |
| pad : :class:`~pyspark.sql.Column` or literal string |
chars to append.
| |
| .. versionchanged:: 4.0.0 |
`pad` now accepts column.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| right padded result. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.lpad` |
| |
| Examples |
| -------- |
| Example 1: Pad with a literal string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('abcd',), ('xyz',), ('12',)], ['s',]) |
| >>> df.select("*", sf.rpad(df.s, 6, '#')).show() |
| +----+-------------+ |
| | s|rpad(s, 6, #)| |
| +----+-------------+ |
| |abcd| abcd##| |
| | xyz| xyz###| |
| | 12| 12####| |
| +----+-------------+ |
| |
| Example 2: Pad with a bytes column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('abcd',), ('xyz',), ('12',)], ['s',]) |
| >>> df.select("*", sf.rpad(df.s, 6, sf.lit(b"\x75\x76"))).show() |
| +----+-------------------+ |
| | s|rpad(s, 6, X'7576')| |
| +----+-------------------+ |
| |abcd| abcduv| |
| | xyz| xyzuvu| |
| | 12| 12uvuv| |
| +----+-------------------+ |
| """ |
| return _invoke_function_over_columns("rpad", col, lit(len), lit(pad)) |
| |
| |
| @_try_remote_functions |
| def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column: |
| """ |
| Repeats a string column n times, and returns it as a new string column. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| n : :class:`~pyspark.sql.Column` or column name or int |
| number of times to repeat value. |
| |
| .. versionchanged:: 4.0.0 |
| `n` now accepts column and column name. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| string with repeated values. |
| |
| Examples |
| -------- |
| Example 1: Repeat with a constant number of times |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ab',)], ['s',]) |
| >>> df.select("*", sf.repeat("s", 3)).show() |
| +---+------------+ |
| | s|repeat(s, 3)| |
| +---+------------+ |
| | ab| ababab| |
| +---+------------+ |
| |
| >>> df.select("*", sf.repeat(df.s, sf.lit(4))).show() |
| +---+------------+ |
| | s|repeat(s, 4)| |
| +---+------------+ |
| | ab| abababab| |
| +---+------------+ |
| |
Example 2: Repeat with a column containing different numbers of times
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ab', 5,), ('abc', 6,)], ['s', 't']) |
| >>> df.select("*", sf.repeat("s", "t")).show() |
| +---+---+------------------+ |
| | s| t| repeat(s, t)| |
| +---+---+------------------+ |
| | ab| 5| ababababab| |
| |abc| 6|abcabcabcabcabcabc| |
| +---+---+------------------+ |
| """ |
| n = _enum_to_value(n) |
| n = lit(n) if isinstance(n, int) else n |
| return _invoke_function_over_columns("repeat", col, n) |
| |
| |
| @_try_remote_functions |
| def split( |
| str: "ColumnOrName", |
| pattern: Union[Column, str], |
| limit: Union["ColumnOrName", int] = -1, |
| ) -> Column: |
| """ |
| Splits str around matches of the given pattern. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| a string expression to split |
| pattern : :class:`~pyspark.sql.Column` or literal string |
| a string representing a regular expression. The regex string should be |
| a Java regular expression. |
| |
| .. versionchanged:: 4.0.0 |
`pattern` now accepts column. Does not accept column name since the string type remains
accepted as a regular expression representation, for backwards compatibility.
| In addition to int, `limit` now accepts column and column name. |
| |
| limit : :class:`~pyspark.sql.Column` or column name or int |
| an integer which controls the number of times `pattern` is applied. |
| |
| * ``limit > 0``: The resulting array's length will not be more than `limit`, and the |
| resulting array's last entry will contain all input beyond the last |
| matched pattern. |
| * ``limit <= 0``: `pattern` will be applied as many times as possible, and the resulting |
| array can be of any size. |
| |
| .. versionchanged:: 3.0 |
`split` now takes an optional `limit` field. If not provided, the default limit value is -1.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| array of separated strings. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.sentences` |
| :meth:`pyspark.sql.functions.split_part` |
| |
| Examples |
| -------- |
Example 1: Split with a constant pattern
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',]) |
| >>> df.select('*', sf.split(df.s, '[ABC]')).show() |
| +--------------+-------------------+ |
| | s|split(s, [ABC], -1)| |
| +--------------+-------------------+ |
| |oneAtwoBthreeC|[one, two, three, ]| |
| +--------------+-------------------+ |
| |
| >>> df.select('*', sf.split(df.s, '[ABC]', 2)).show() |
| +--------------+------------------+ |
| | s|split(s, [ABC], 2)| |
| +--------------+------------------+ |
| |oneAtwoBthreeC| [one, twoBthreeC]| |
| +--------------+------------------+ |
| |
| >>> df.select('*', sf.split('s', '[ABC]', -2)).show() |
| +--------------+-------------------+ |
| | s|split(s, [ABC], -2)| |
| +--------------+-------------------+ |
| |oneAtwoBthreeC|[one, two, three, ]| |
| +--------------+-------------------+ |
| |
Example 2: Split with columns containing different patterns and limits
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([ |
| ... ('oneAtwoBthreeC', '[ABC]', 2), |
| ... ('1A2B3C', '[1-9]+', 1), |
| ... ('aa2bb3cc4', '[1-9]+', -1)], ['s', 'p', 'l']) |
| >>> df.select('*', sf.split(df.s, df.p)).show() |
| +--------------+------+---+-------------------+ |
| | s| p| l| split(s, p, -1)| |
| +--------------+------+---+-------------------+ |
| |oneAtwoBthreeC| [ABC]| 2|[one, two, three, ]| |
| | 1A2B3C|[1-9]+| 1| [, A, B, C]| |
| | aa2bb3cc4|[1-9]+| -1| [aa, bb, cc, ]| |
| +--------------+------+---+-------------------+ |
| |
| >>> df.select(sf.split('s', df.p, 'l')).show() |
| +-----------------+ |
| | split(s, p, l)| |
| +-----------------+ |
| |[one, twoBthreeC]| |
| | [1A2B3C]| |
| | [aa, bb, cc, ]| |
| +-----------------+ |
| """ |
| limit = _enum_to_value(limit) |
| limit = lit(limit) if isinstance(limit, int) else limit |
| return _invoke_function_over_columns("split", str, lit(pattern), limit) |
| |
| |
| @_try_remote_functions |
| def rlike(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: |
| r"""Returns true if `str` matches the Java regex `regexp`, or false otherwise. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| regexp : :class:`~pyspark.sql.Column` or column name |
| regex pattern to apply. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| true if `str` matches a Java regex, or false otherwise. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("1a 2b 14m", r"(\d+)")], ["str", "regexp"]) |
| >>> df.select('*', sf.rlike('str', sf.lit(r'(\d+)'))).show() |
| +---------+------+-----------------+ |
| | str|regexp|RLIKE(str, (\d+))| |
| +---------+------+-----------------+ |
| |1a 2b 14m| (\d+)| true| |
| +---------+------+-----------------+ |
| |
| >>> df.select('*', sf.rlike('str', sf.lit(r'\d{2}b'))).show() |
| +---------+------+------------------+ |
| | str|regexp|RLIKE(str, \d{2}b)| |
| +---------+------+------------------+ |
| |1a 2b 14m| (\d+)| false| |
| +---------+------+------------------+ |
| |
| >>> df.select('*', sf.rlike("str", sf.col("regexp"))).show() |
| +---------+------+------------------+ |
| | str|regexp|RLIKE(str, regexp)| |
| +---------+------+------------------+ |
| |1a 2b 14m| (\d+)| true| |
| +---------+------+------------------+ |
| |
| >>> df.select('*', sf.rlike("str", "regexp")).show() |
| +---------+------+------------------+ |
| | str|regexp|RLIKE(str, regexp)| |
| +---------+------+------------------+ |
| |1a 2b 14m| (\d+)| true| |
| +---------+------+------------------+ |
| """ |
| return _invoke_function_over_columns("rlike", str, regexp) |
| |
| |
| @_try_remote_functions |
| def regexp(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: |
| r"""Returns true if `str` matches the Java regex `regexp`, or false otherwise. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| target column to work on. |
| regexp : :class:`~pyspark.sql.Column` or str |
| regex pattern to apply. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| true if `str` matches a Java regex, or false otherwise. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"] |
| ... ).select(sf.regexp('str', sf.lit(r'(\d+)'))).show() |
| +------------------+ |
| |REGEXP(str, (\d+))| |
| +------------------+ |
| | true| |
| +------------------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"] |
| ... ).select(sf.regexp('str', sf.lit(r'\d{2}b'))).show() |
| +-------------------+ |
| |REGEXP(str, \d{2}b)| |
| +-------------------+ |
| | false| |
| +-------------------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"] |
| ... ).select(sf.regexp('str', sf.col("regexp"))).show() |
| +-------------------+ |
| |REGEXP(str, regexp)| |
| +-------------------+ |
| | true| |
| +-------------------+ |
| """ |
| return _invoke_function_over_columns("regexp", str, regexp) |
| |
| |
| @_try_remote_functions |
| def regexp_like(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: |
| r"""Returns true if `str` matches the Java regex `regexp`, or false otherwise. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| target column to work on. |
| regexp : :class:`~pyspark.sql.Column` or str |
| regex pattern to apply. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| true if `str` matches a Java regex, or false otherwise. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"] |
| ... ).select(sf.regexp_like('str', sf.lit(r'(\d+)'))).show() |
| +-----------------------+ |
| |REGEXP_LIKE(str, (\d+))| |
| +-----------------------+ |
| | true| |
| +-----------------------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"] |
| ... ).select(sf.regexp_like('str', sf.lit(r'\d{2}b'))).show() |
| +------------------------+ |
| |REGEXP_LIKE(str, \d{2}b)| |
| +------------------------+ |
| | false| |
| +------------------------+ |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"] |
| ... ).select(sf.regexp_like('str', sf.col("regexp"))).show() |
| +------------------------+ |
| |REGEXP_LIKE(str, regexp)| |
| +------------------------+ |
| | true| |
| +------------------------+ |
| """ |
| return _invoke_function_over_columns("regexp_like", str, regexp) |
| |
| |
| @_try_remote_functions |
| def randstr(length: Union[Column, int], seed: Optional[Union[Column, int]] = None) -> Column: |
| """Returns a string of the specified length whose characters are chosen uniformly at random from |
| the following pool of characters: 0-9, a-z, A-Z. The random seed is optional. The string length |
| must be a constant two-byte or four-byte integer (SMALLINT or INT, respectively). |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| length : :class:`~pyspark.sql.Column` or int |
| Number of characters in the string to generate. |
| seed : :class:`~pyspark.sql.Column` or int |
| Optional random number seed to use. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The generated random string with the specified length. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.rand` |
| :meth:`pyspark.sql.functions.randn` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(0, 10, 1, 1).select(sf.randstr(16, 3)).show() |
| +----------------+ |
| | randstr(16, 3)| |
| +----------------+ |
| |nurJIpH4cmmMnsCG| |
| |fl9YtT5m01trZtIt| |
| |PD19rAgscTHS7qQZ| |
| |2CuAICF5UJOruVv4| |
| |kNZEs8nDpJEoz3Rl| |
| |OXiU0KN5eaXfjXFs| |
| |qfnTM1BZAHtN0gBV| |
| |1p8XiSKwg33KnRPK| |
| |od5y5MucayQq1bKK| |
| |tklYPmKmc5sIppWM| |
| +----------------+ |
| """ |
| length = _enum_to_value(length) |
| length = lit(length) |
| if seed is None: |
| return _invoke_function_over_columns("randstr", length) |
| else: |
| seed = _enum_to_value(seed) |
| seed = lit(seed) |
| return _invoke_function_over_columns("randstr", length, seed) |
| |
| |
| @_try_remote_functions |
| def regexp_count(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: |
| r"""Returns a count of the number of times that the Java regex pattern `regexp` is matched |
| in the string `str`. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| regexp : :class:`~pyspark.sql.Column` or column name |
| regex pattern to apply. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the number of times that a Java regex pattern is matched in the string. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"]) |
| >>> df.select('*', sf.regexp_count('str', sf.lit(r'\d+'))).show() |
| +---------+------+----------------------+ |
| | str|regexp|regexp_count(str, \d+)| |
| +---------+------+----------------------+ |
| |1a 2b 14m| \d+| 3| |
| +---------+------+----------------------+ |
| |
| >>> df.select('*', sf.regexp_count('str', sf.lit(r'mmm'))).show() |
| +---------+------+----------------------+ |
| | str|regexp|regexp_count(str, mmm)| |
| +---------+------+----------------------+ |
| |1a 2b 14m| \d+| 0| |
| +---------+------+----------------------+ |
| |
| >>> df.select('*', sf.regexp_count("str", sf.col("regexp"))).show() |
| +---------+------+-------------------------+ |
| | str|regexp|regexp_count(str, regexp)| |
| +---------+------+-------------------------+ |
| |1a 2b 14m| \d+| 3| |
| +---------+------+-------------------------+ |
| |
| >>> df.select('*', sf.regexp_count(sf.col('str'), "regexp")).show() |
| +---------+------+-------------------------+ |
| | str|regexp|regexp_count(str, regexp)| |
| +---------+------+-------------------------+ |
| |1a 2b 14m| \d+| 3| |
| +---------+------+-------------------------+ |
| """ |
| return _invoke_function_over_columns("regexp_count", str, regexp) |
| |
| |
| @_try_remote_functions |
| def regexp_extract(str: "ColumnOrName", pattern: str, idx: int) -> Column: |
| r"""Extract a specific group matched by the Java regex `regexp`, from the specified string column. |
| If the regex did not match, or the specified group did not match, an empty string is returned. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| pattern : str |
| regex pattern to apply. |
| idx : int |
| matched group id. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| matched value specified by `idx` group id. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regexp_extract_all` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('100-200',)], ['str']) |
| >>> df.select('*', sf.regexp_extract('str', r'(\d+)-(\d+)', 1)).show() |
| +-------+-----------------------------------+ |
| | str|regexp_extract(str, (\d+)-(\d+), 1)| |
| +-------+-----------------------------------+ |
| |100-200| 100| |
| +-------+-----------------------------------+ |
| |
| >>> df = spark.createDataFrame([('foo',)], ['str']) |
| >>> df.select('*', sf.regexp_extract('str', r'(\d+)', 1)).show() |
| +---+-----------------------------+ |
| |str|regexp_extract(str, (\d+), 1)| |
| +---+-----------------------------+ |
| |foo| | |
| +---+-----------------------------+ |
| |
| >>> df = spark.createDataFrame([('aaaac',)], ['str']) |
| >>> df.select('*', sf.regexp_extract(sf.col('str'), '(a+)(b)?(c)', 2)).show() |
| +-----+-----------------------------------+ |
| | str|regexp_extract(str, (a+)(b)?(c), 2)| |
| +-----+-----------------------------------+ |
| |aaaac| | |
| +-----+-----------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "regexp_extract", _to_java_column(str), _enum_to_value(pattern), _enum_to_value(idx) |
| ) |
| |
| |
| @_try_remote_functions |
| def regexp_extract_all( |
| str: "ColumnOrName", regexp: "ColumnOrName", idx: Optional[Union[int, Column]] = None |
| ) -> Column: |
| r"""Extract all strings in the `str` that match the Java regex `regexp` |
| and corresponding to the regex group index. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| regexp : :class:`~pyspark.sql.Column` or column name |
| regex pattern to apply. |
| idx : :class:`~pyspark.sql.Column` or int, optional |
| matched group id. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
all strings in `str` that match the Java regex and correspond to the regex group index.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.regexp_extract` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("100-200, 300-400", r"(\d+)-(\d+)")], ["str", "regexp"]) |
| >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'))).show() |
| +----------------+-----------+---------------------------------------+ |
| | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 1)| |
| +----------------+-----------+---------------------------------------+ |
| |100-200, 300-400|(\d+)-(\d+)| [100, 300]| |
| +----------------+-----------+---------------------------------------+ |
| |
| >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'), sf.lit(1))).show() |
| +----------------+-----------+---------------------------------------+ |
| | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 1)| |
| +----------------+-----------+---------------------------------------+ |
| |100-200, 300-400|(\d+)-(\d+)| [100, 300]| |
| +----------------+-----------+---------------------------------------+ |
| |
| >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'), 2)).show() |
| +----------------+-----------+---------------------------------------+ |
| | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 2)| |
| +----------------+-----------+---------------------------------------+ |
| |100-200, 300-400|(\d+)-(\d+)| [200, 400]| |
| +----------------+-----------+---------------------------------------+ |
| |
| >>> df.select('*', sf.regexp_extract_all('str', sf.col("regexp"))).show() |
| +----------------+-----------+----------------------------------+ |
| | str| regexp|regexp_extract_all(str, regexp, 1)| |
| +----------------+-----------+----------------------------------+ |
| |100-200, 300-400|(\d+)-(\d+)| [100, 300]| |
| +----------------+-----------+----------------------------------+ |
| |
| >>> df.select('*', sf.regexp_extract_all(sf.col('str'), "regexp")).show() |
| +----------------+-----------+----------------------------------+ |
| | str| regexp|regexp_extract_all(str, regexp, 1)| |
| +----------------+-----------+----------------------------------+ |
| |100-200, 300-400|(\d+)-(\d+)| [100, 300]| |
| +----------------+-----------+----------------------------------+ |
| """ |
| if idx is None: |
| return _invoke_function_over_columns("regexp_extract_all", str, regexp) |
| else: |
| return _invoke_function_over_columns("regexp_extract_all", str, regexp, lit(idx)) |
| |
| |
| @_try_remote_functions |
| def regexp_replace( |
| string: "ColumnOrName", pattern: Union[str, Column], replacement: Union[str, Column] |
| ) -> Column: |
| r"""Replace all substrings of the specified string value that match regexp with replacement. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| string : :class:`~pyspark.sql.Column` or str |
| column name or column containing the string value |
| pattern : :class:`~pyspark.sql.Column` or str |
| column object or str containing the regexp pattern |
| replacement : :class:`~pyspark.sql.Column` or str |
| column object or str containing the replacement |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| string with all substrings replaced. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("100-200", r"(\d+)", "--")], |
| ... ["str", "pattern", "replacement"] |
| ... ) |
| |
| Example 1: Replaces all the substrings in the `str` column name that |
| match the regex pattern `(\d+)` (one or more digits) with the replacement |
| string "--". |
| |
| >>> df.select('*', sf.regexp_replace('str', r'(\d+)', '--')).show() |
| +-------+-------+-----------+---------------------------------+ |
| | str|pattern|replacement|regexp_replace(str, (\d+), --, 1)| |
| +-------+-------+-----------+---------------------------------+ |
| |100-200| (\d+)| --| -----| |
| +-------+-------+-----------+---------------------------------+ |
| |
| Example 2: Replaces all the substrings in the `str` Column that match |
| the regex pattern in the `pattern` Column with the string in the `replacement` |
| column. |
| |
| >>> df.select('*', \ |
| ... sf.regexp_replace(sf.col("str"), sf.col("pattern"), sf.col("replacement")) \ |
| ... ).show() |
| +-------+-------+-----------+--------------------------------------------+ |
| | str|pattern|replacement|regexp_replace(str, pattern, replacement, 1)| |
| +-------+-------+-----------+--------------------------------------------+ |
| |100-200| (\d+)| --| -----| |
| +-------+-------+-----------+--------------------------------------------+ |
| """ |
| return _invoke_function_over_columns("regexp_replace", string, lit(pattern), lit(replacement)) |
| |
| |
| @_try_remote_functions |
| def regexp_substr(str: "ColumnOrName", regexp: "ColumnOrName") -> Column: |
| r"""Returns the first substring that matches the Java regex `regexp` within the string `str`. |
| If the regular expression is not found, the result is null. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| regexp : :class:`~pyspark.sql.Column` or column name |
| regex pattern to apply. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| the first substring that matches a Java regex within the string `str`. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"]) |
| |
| Example 1: Returns the first substring in the `str` column name that |
| matches the regex pattern `(\d+)` (one or more digits). |
| |
| >>> df.select('*', sf.regexp_substr('str', sf.lit(r'\d+'))).show() |
| +---------+------+-----------------------+ |
| | str|regexp|regexp_substr(str, \d+)| |
| +---------+------+-----------------------+ |
| |1a 2b 14m| \d+| 1| |
| +---------+------+-----------------------+ |
| |
| Example 2: Returns the first substring in the `str` column name that |
| matches the regex pattern `(mmm)` (three consecutive 'm' characters) |
| |
| >>> df.select('*', sf.regexp_substr('str', sf.lit(r'mmm'))).show() |
| +---------+------+-----------------------+ |
| | str|regexp|regexp_substr(str, mmm)| |
| +---------+------+-----------------------+ |
| |1a 2b 14m| \d+| NULL| |
| +---------+------+-----------------------+ |
| |
| Example 3: Returns the first substring in the `str` column name that |
| matches the regex pattern in `regexp` Column. |
| |
| >>> df.select('*', sf.regexp_substr("str", sf.col("regexp"))).show() |
| +---------+------+--------------------------+ |
| | str|regexp|regexp_substr(str, regexp)| |
| +---------+------+--------------------------+ |
| |1a 2b 14m| \d+| 1| |
| +---------+------+--------------------------+ |
| |
| Example 4: Returns the first substring in the `str` Column that |
| matches the regex pattern in `regexp` column name. |
| |
| >>> df.select('*', sf.regexp_substr(sf.col("str"), "regexp")).show() |
| +---------+------+--------------------------+ |
| | str|regexp|regexp_substr(str, regexp)| |
| +---------+------+--------------------------+ |
| |1a 2b 14m| \d+| 1| |
| +---------+------+--------------------------+ |
| """ |
| return _invoke_function_over_columns("regexp_substr", str, regexp) |
| |
| |
| @_try_remote_functions |
| def regexp_instr( |
| str: "ColumnOrName", regexp: "ColumnOrName", idx: Optional[Union[int, Column]] = None |
| ) -> Column: |
| r"""Returns the position of the first substring in the `str` that match the Java regex `regexp` |
| and corresponding to the regex group index. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| regexp : :class:`~pyspark.sql.Column` or column name |
| regex pattern to apply. |
| idx : :class:`~pyspark.sql.Column` or int, optional |
| matched group id. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
the position of the first substring in `str` that matches the Java regex and corresponds
to the regex group index.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+(a|b|m)")], ["str", "regexp"]) |
| |
| Example 1: Returns the position of the first substring in the `str` column name that |
matches the regex pattern `(\d+(a|b|m))` (one or more digits followed by 'a', 'b', or 'm').
| |
| >>> df.select('*', sf.regexp_instr('str', sf.lit(r'\d+(a|b|m)'))).show() |
| +---------+----------+--------------------------------+ |
| | str| regexp|regexp_instr(str, \d+(a|b|m), 0)| |
| +---------+----------+--------------------------------+ |
| |1a 2b 14m|\d+(a|b|m)| 1| |
| +---------+----------+--------------------------------+ |
| |
| Example 2: Returns the position of the first substring in the `str` column name that |
matches the regex pattern `(\d+(a|b|m))`, with the group index explicitly set to 1.
| |
| >>> df.select('*', sf.regexp_instr('str', sf.lit(r'\d+(a|b|m)'), sf.lit(1))).show() |
| +---------+----------+--------------------------------+ |
| | str| regexp|regexp_instr(str, \d+(a|b|m), 1)| |
| +---------+----------+--------------------------------+ |
| |1a 2b 14m|\d+(a|b|m)| 1| |
| +---------+----------+--------------------------------+ |
| |
| Example 3: Returns the position of the first substring in the `str` column name that |
matches the regex pattern in the `regexp` Column.
| |
| >>> df.select('*', sf.regexp_instr('str', sf.col("regexp"))).show() |
| +---------+----------+----------------------------+ |
| | str| regexp|regexp_instr(str, regexp, 0)| |
| +---------+----------+----------------------------+ |
| |1a 2b 14m|\d+(a|b|m)| 1| |
| +---------+----------+----------------------------+ |
| |
| Example 4: Returns the position of the first substring in the `str` Column that |
matches the regex pattern in the `regexp` column name.
| |
| >>> df.select('*', sf.regexp_instr(sf.col("str"), "regexp")).show() |
| +---------+----------+----------------------------+ |
| | str| regexp|regexp_instr(str, regexp, 0)| |
| +---------+----------+----------------------------+ |
| |1a 2b 14m|\d+(a|b|m)| 1| |
| +---------+----------+----------------------------+ |
| """ |
| if idx is None: |
| return _invoke_function_over_columns("regexp_instr", str, regexp) |
| else: |
| return _invoke_function_over_columns("regexp_instr", str, regexp, lit(idx)) |
| |
| |
| @_try_remote_functions |
| def initcap(col: "ColumnOrName") -> Column: |
| """Translate the first letter of each word to upper case in the sentence. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| string with all first letters are uppercase in each word. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ab cd',)], ['a']) |
| >>> df.select("*", sf.initcap("a")).show() |
| +-----+----------+ |
| | a|initcap(a)| |
| +-----+----------+ |
| |ab cd| Ab Cd| |
| +-----+----------+ |
| """ |
| return _invoke_function_over_columns("initcap", col) |
| |
| |
| @_try_remote_functions |
| def soundex(col: "ColumnOrName") -> Column: |
| """ |
Returns the SoundEx encoding for a string.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| SoundEx encoded string. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ["s"]) |
| >>> df.select("*", sf.soundex("s")).show() |
| +-------+----------+ |
| | s|soundex(s)| |
| +-------+----------+ |
| | Peters| P362| |
| |Uhrbach| U612| |
| +-------+----------+ |
| """ |
| return _invoke_function_over_columns("soundex", col) |
| |
| |
| @_try_remote_functions |
| def bin(col: "ColumnOrName") -> Column: |
| """Returns the string representation of the binary value of the given column. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| binary representation of given value as string. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(10).select("*", sf.bin("id")).show() |
| +---+-------+ |
| | id|bin(id)| |
| +---+-------+ |
| | 0| 0| |
| | 1| 1| |
| | 2| 10| |
| | 3| 11| |
| | 4| 100| |
| | 5| 101| |
| | 6| 110| |
| | 7| 111| |
| | 8| 1000| |
| | 9| 1001| |
| +---+-------+ |
| """ |
| return _invoke_function_over_columns("bin", col) |
| |
| |
| @_try_remote_functions |
| def hex(col: "ColumnOrName") -> Column: |
| """Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`, |
| :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or |
| :class:`pyspark.sql.types.LongType`. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.unhex` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| hexadecimal representation of given value as string. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('ABC', 3)], ['a', 'b']) |
| >>> df.select('*', sf.hex('a'), sf.hex(df.b)).show() |
| +---+---+------+------+ |
| | a| b|hex(a)|hex(b)| |
| +---+---+------+------+ |
| |ABC| 3|414243| 3| |
| +---+---+------+------+ |
| """ |
| return _invoke_function_over_columns("hex", col) |
| |
| |
| @_try_remote_functions |
| def unhex(col: "ColumnOrName") -> Column: |
| """Inverse of hex. Interprets each pair of characters as a hexadecimal number |
and converts it to the byte representation of the number.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.hex` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| string representation of given hexadecimal value. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('414243',)], ['a']) |
| >>> df.select('*', sf.unhex('a')).show() |
| +------+----------+ |
| | a| unhex(a)| |
| +------+----------+ |
| |414243|[41 42 43]| |
| +------+----------+ |
| """ |
| return _invoke_function_over_columns("unhex", col) |
| |
| |
| @_try_remote_functions |
| def uniform( |
| min: Union[Column, int, float], |
| max: Union[Column, int, float], |
| seed: Optional[Union[Column, int]] = None, |
| ) -> Column: |
| """Returns a random value with independent and identically distributed (i.i.d.) values with the |
| specified range of numbers. The random seed is optional. The provided numbers specifying the |
| minimum and maximum values of the range must be constant. If both of these numbers are integers, |
| then the result will also be an integer. Otherwise if one or both of these are floating-point |
| numbers, then the result will also be a floating-point number. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| min : :class:`~pyspark.sql.Column`, int, or float |
| Minimum value in the range. |
| max : :class:`~pyspark.sql.Column`, int, or float |
| Maximum value in the range. |
| seed : :class:`~pyspark.sql.Column` or int |
| Optional random number seed to use. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The generated random number within the specified range. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(0, 10, 1, 1).select(sf.uniform(5, 105, 3)).show() |
| +------------------+ |
| |uniform(5, 105, 3)| |
| +------------------+ |
| | 30| |
| | 71| |
| | 99| |
| | 77| |
| | 16| |
| | 25| |
| | 89| |
| | 80| |
| | 51| |
| | 83| |
| +------------------+ |
| """ |
| min = _enum_to_value(min) |
| min = lit(min) |
| max = _enum_to_value(max) |
| max = lit(max) |
| if seed is None: |
| return _invoke_function_over_columns("uniform", min, max) |
| else: |
| seed = _enum_to_value(seed) |
| seed = lit(seed) |
| return _invoke_function_over_columns("uniform", min, max, seed) |
| |
| |
| @_try_remote_functions |
| def length(col: "ColumnOrName") -> Column: |
| """Computes the character length of string data or number of bytes of binary data. |
| The length of character data includes the trailing spaces. The length of binary data |
| includes binary zeros. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| length of the value. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.createDataFrame([('ABC ',)], ['a']).select('*', sf.length('a')).show() |
| +----+---------+ |
| | a|length(a)| |
| +----+---------+ |
| |ABC | 4| |
| +----+---------+ |
| """ |
| return _invoke_function_over_columns("length", col) |
| |
| |
| @_try_remote_functions |
| def octet_length(col: "ColumnOrName") -> Column: |
| """ |
| Calculates the byte length for the specified string column. |
| |
| .. versionadded:: 3.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| Source column or strings |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Byte length of the col |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) |
| >>> df.select('*', sf.octet_length('cat')).show() |
| +---+-----------------+ |
| |cat|octet_length(cat)| |
| +---+-----------------+ |
| |cat| 3| |
| | 🐈| 4| |
| +---+-----------------+ |
| """ |
| return _invoke_function_over_columns("octet_length", col) |
| |
| |
| @_try_remote_functions |
| def bit_length(col: "ColumnOrName") -> Column: |
| """ |
| Calculates the bit length for the specified string column. |
| |
| .. versionadded:: 3.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| Source column or strings |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Bit length of the col |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) |
| >>> df.select('*', sf.bit_length('cat')).show() |
| +---+---------------+ |
| |cat|bit_length(cat)| |
| +---+---------------+ |
| |cat| 24| |
| | 🐈| 32| |
| +---+---------------+ |
| """ |
| return _invoke_function_over_columns("bit_length", col) |
| |
| |
| @_try_remote_functions |
| def translate(srcCol: "ColumnOrName", matching: str, replace: str) -> Column: |
| """A function translate any character in the `srcCol` by a character in `matching`. |
| The characters in `replace` is corresponding to the characters in `matching`. |
| Translation will happen whenever any character in the string is matching with the character |
| in the `matching`. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| srcCol : :class:`~pyspark.sql.Column` or column name |
| Source column or strings |
| matching : str |
| matching characters. |
| replace : str |
| characters for replacement. If this is shorter than `matching` string then |
| those chars that don't have replacement will be dropped. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| replaced value. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('translate',)], ['a']) |
| >>> df.select('*', sf.translate('a', "rnlt", "123")).show() |
| +---------+-----------------------+ |
| | a|translate(a, rnlt, 123)| |
| +---------+-----------------------+ |
| |translate| 1a2s3ae| |
| +---------+-----------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function( |
| "translate", _to_java_column(srcCol), _enum_to_value(matching), _enum_to_value(replace) |
| ) |
| |
| |
| @_try_remote_functions |
| def to_binary(col: "ColumnOrName", format: Optional["ColumnOrName"] = None) -> Column: |
| """ |
| Converts the input `col` to a binary value based on the supplied `format`. |
| The `format` can be a case-insensitive string literal of "hex", "utf-8", "utf8", |
| or "base64". By default, the binary format for conversion is "hex" if |
| `format` is omitted. The function returns NULL if at least one of the |
| input parameters is NULL. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| format : :class:`~pyspark.sql.Column` or str, optional |
| format to use to convert binary values. |
| |
| Examples |
| -------- |
Example 1: Convert string to a binary with encoding specified

>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame([("abc",)], ["e"])
>>> df.select(sf.to_binary(df.e, sf.lit("utf-8")).alias('r')).collect()
[Row(r=bytearray(b'abc'))]

Example 2: Convert a hexadecimal string to binary without encoding specified

>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame([("414243",)], ["e"])
>>> df.select(sf.to_binary(df.e).alias('r')).collect()
[Row(r=bytearray(b'ABC'))]
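
Example 3: Convert a base64 string to binary (an extra illustration; 'YWJj' is the
base64 encoding of 'abc', assuming the default session configuration)

>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame([("YWJj",)], ["e"])
>>> df.select(sf.to_binary(df.e, sf.lit("base64")).alias('r')).collect()
[Row(r=bytearray(b'abc'))]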
| """ |
| if format is not None: |
| return _invoke_function_over_columns("to_binary", col, format) |
| else: |
| return _invoke_function_over_columns("to_binary", col) |
| |
| |
| @_try_remote_functions |
| def to_char(col: "ColumnOrName", format: "ColumnOrName") -> Column: |
| """ |
| Convert `col` to a string based on the `format`. |
| Throws an exception if the conversion fails. The format can consist of the following |
| characters, case insensitive: |
| '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the |
| format string matches a sequence of digits in the input value, generating a result |
| string of the same length as the corresponding sequence in the format string. |
| The result string is left-padded with zeros if the 0/9 sequence comprises more digits |
| than the matching part of the decimal value, starts with 0, and is before the decimal |
| point. Otherwise, it is padded with spaces. |
| '.' or 'D': Specifies the position of the decimal point (optional, only allowed once). |
| ',' or 'G': Specifies the position of the grouping (thousands) separator (,). |
| There must be a 0 or 9 to the left and right of each grouping separator. |
| '$': Specifies the location of the $ currency sign. This character may only be specified once. |
| 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at |
| the beginning or end of the format string). Note that 'S' prints '+' for positive |
| values but 'MI' prints a space. |
| 'PR': Only allowed at the end of the format string; specifies that the result string |
| will be wrapped by angle brackets if the input value is negative. |
| If `col` is a datetime, `format` shall be a valid datetime pattern, see |
| <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Patterns</a>. |
| If `col` is a binary, it is converted to a string in one of the formats: |
| 'base64': a base 64 string. |
| 'hex': a string in the hexadecimal format. |
| 'utf-8': the input binary is decoded to UTF-8 string. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
format : :class:`~pyspark.sql.Column` or str
| format to use to convert char values. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(78.12,)], ["e"]) |
| >>> df.select(to_char(df.e, lit("$99.99")).alias('r')).collect() |
| [Row(r='$78.12')] |
| """ |
| return _invoke_function_over_columns("to_char", col, format) |
| |
| |
| @_try_remote_functions |
| def to_varchar(col: "ColumnOrName", format: "ColumnOrName") -> Column: |
| """ |
| Convert `col` to a string based on the `format`. |
| Throws an exception if the conversion fails. The format can consist of the following |
| characters, case insensitive: |
| '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the |
| format string matches a sequence of digits in the input value, generating a result |
| string of the same length as the corresponding sequence in the format string. |
| The result string is left-padded with zeros if the 0/9 sequence comprises more digits |
| than the matching part of the decimal value, starts with 0, and is before the decimal |
| point. Otherwise, it is padded with spaces. |
| '.' or 'D': Specifies the position of the decimal point (optional, only allowed once). |
| ',' or 'G': Specifies the position of the grouping (thousands) separator (,). |
| There must be a 0 or 9 to the left and right of each grouping separator. |
| '$': Specifies the location of the $ currency sign. This character may only be specified once. |
| 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at |
| the beginning or end of the format string). Note that 'S' prints '+' for positive |
| values but 'MI' prints a space. |
| 'PR': Only allowed at the end of the format string; specifies that the result string |
| will be wrapped by angle brackets if the input value is negative. |
| If `col` is a datetime, `format` shall be a valid datetime pattern, see |
| <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Patterns</a>. |
| If `col` is a binary, it is converted to a string in one of the formats: |
| 'base64': a base 64 string. |
| 'hex': a string in the hexadecimal format. |
| 'utf-8': the input binary is decoded to UTF-8 string. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
format : :class:`~pyspark.sql.Column` or str
| format to use to convert char values. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(78.12,)], ["e"]) |
| >>> df.select(to_varchar(df.e, lit("$99.99")).alias('r')).collect() |
| [Row(r='$78.12')] |
| """ |
| return _invoke_function_over_columns("to_varchar", col, format) |
| |
| |
| @_try_remote_functions |
| def to_number(col: "ColumnOrName", format: "ColumnOrName") -> Column: |
| """ |
| Convert string 'col' to a number based on the string format 'format'. |
| Throws an exception if the conversion fails. The format can consist of the following |
| characters, case insensitive: |
| '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the |
| format string matches a sequence of digits in the input string. If the 0/9 |
| sequence starts with 0 and is before the decimal point, it can only match a digit |
| sequence of the same size. Otherwise, if the sequence starts with 9 or is after |
| the decimal point, it can match a digit sequence that has the same or smaller size. |
| '.' or 'D': Specifies the position of the decimal point (optional, only allowed once). |
| ',' or 'G': Specifies the position of the grouping (thousands) separator (,). |
| There must be a 0 or 9 to the left and right of each grouping separator. |
| 'col' must match the grouping separator relevant for the size of the number. |
| '$': Specifies the location of the $ currency sign. This character may only be |
| specified once. |
| 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed |
| once at the beginning or end of the format string). Note that 'S' allows '-' |
| but 'MI' does not. |
| 'PR': Only allowed at the end of the format string; specifies that 'col' indicates a |
| negative number with wrapping angled brackets. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
format : :class:`~pyspark.sql.Column` or str
| format to use to convert number values. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("$78.12",)], ["e"]) |
| >>> df.select(to_number(df.e, lit("$99.99")).alias('r')).collect() |
| [Row(r=Decimal('78.12'))] |
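
An additional illustration of the grouping separator described above; the input must
contain a separator where the format specifies one:

>>> df = spark.createDataFrame([("12,345",)], ["e"])
>>> df.select(to_number(df.e, lit("99,999")).alias('r')).collect()
[Row(r=Decimal('12345'))]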
| """ |
| return _invoke_function_over_columns("to_number", col, format) |
| |
| |
| @_try_remote_functions |
| def replace( |
| src: "ColumnOrName", search: "ColumnOrName", replace: Optional["ColumnOrName"] = None |
| ) -> Column: |
| """ |
| Replaces all occurrences of `search` with `replace`. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
src : :class:`~pyspark.sql.Column` or str
A column of string in which to perform the replacement.
search : :class:`~pyspark.sql.Column` or str
A column of string to search for. If `search` is not found in `src`, `src` is returned unchanged.
replace : :class:`~pyspark.sql.Column` or str, optional
A column of string to replace with. If `replace` is not specified or is an empty string,
the matched `search` string is simply removed from `src`.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("ABCabc", "abc", "DEF",)], ["a", "b", "c"]) |
| >>> df.select(replace(df.a, df.b, df.c).alias('r')).collect() |
| [Row(r='ABCDEF')] |
| |
| >>> df.select(replace(df.a, df.b).alias('r')).collect() |
| [Row(r='ABC')] |
| """ |
| if replace is not None: |
| return _invoke_function_over_columns("replace", src, search, replace) |
| else: |
| return _invoke_function_over_columns("replace", src, search) |
| |
| |
| @_try_remote_functions |
| def split_part(src: "ColumnOrName", delimiter: "ColumnOrName", partNum: "ColumnOrName") -> Column: |
| """ |
Splits `src` by `delimiter` and returns the requested part of the split (1-based).
If any input is null, returns null. If `partNum` is out of range of the split parts,
returns an empty string. If `partNum` is 0, throws an error. If `partNum` is negative,
the parts are counted backward from the end of the string.
If `delimiter` is an empty string, `src` is not split.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| src : :class:`~pyspark.sql.Column` or column name |
| A column of string to be split. |
| delimiter : :class:`~pyspark.sql.Column` or column name |
| A column of string, the delimiter used for split. |
| partNum : :class:`~pyspark.sql.Column` or column name |
A column of integer, the requested part of the split (1-based).
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.sentences` |
| :meth:`pyspark.sql.functions.split` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"]) |
| >>> df.select("*", sf.split_part("a", "b", "c")).show() |
| +--------+---+---+-------------------+ |
| | a| b| c|split_part(a, b, c)| |
| +--------+---+---+-------------------+ |
| |11.12.13| .| 3| 13| |
| +--------+---+---+-------------------+ |
| |
| >>> df.select("*", sf.split_part(df.a, df.b, sf.lit(-2))).show() |
| +--------+---+---+--------------------+ |
| | a| b| c|split_part(a, b, -2)| |
| +--------+---+---+--------------------+ |
| |11.12.13| .| 3| 12| |
| +--------+---+---+--------------------+ |
| """ |
| return _invoke_function_over_columns("split_part", src, delimiter, partNum) |
| |
| |
| @_try_remote_functions |
| def substr( |
| str: "ColumnOrName", pos: "ColumnOrName", len: Optional["ColumnOrName"] = None |
| ) -> Column: |
| """ |
| Returns the substring of `str` that starts at `pos` and is of length `len`, |
| or the slice of byte array that starts at `pos` and is of length `len`. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or column name |
| A column of string. |
pos : :class:`~pyspark.sql.Column` or column name
A column of integer, the starting position (1-based).
len : :class:`~pyspark.sql.Column` or column name, optional
A column of integer, the length of the substring.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| substring of given value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.instr` |
| :meth:`pyspark.sql.functions.substring` |
| :meth:`pyspark.sql.functions.substring_index` |
| :meth:`pyspark.sql.Column.substr` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Spark SQL", 5, 1,)], ["a", "b", "c"]) |
| >>> df.select("*", sf.substr("a", "b", "c")).show() |
| +---------+---+---+---------------+ |
| | a| b| c|substr(a, b, c)| |
| +---------+---+---+---------------+ |
| |Spark SQL| 5| 1| k| |
| +---------+---+---+---------------+ |
| |
| >>> df.select("*", sf.substr(df.a, df.b)).show() |
| +---------+---+---+------------------------+ |
| | a| b| c|substr(a, b, 2147483647)| |
| +---------+---+---+------------------------+ |
| |Spark SQL| 5| 1| k SQL| |
| +---------+---+---+------------------------+ |
| """ |
| if len is not None: |
| return _invoke_function_over_columns("substr", str, pos, len) |
| else: |
| return _invoke_function_over_columns("substr", str, pos) |
| |
| |
| @_try_remote_functions |
| def try_parse_url( |
| url: "ColumnOrName", partToExtract: "ColumnOrName", key: Optional["ColumnOrName"] = None |
| ) -> Column: |
| """ |
| This is a special version of `parse_url` that performs the same operation, but returns a |
| NULL value instead of raising an error if the parsing cannot be performed. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| url : :class:`~pyspark.sql.Column` or str |
| A column of strings, each representing a URL. |
| partToExtract : :class:`~pyspark.sql.Column` or str |
| A column of strings, each representing the part to extract from the URL. |
| key : :class:`~pyspark.sql.Column` or str, optional |
| A column of strings, each representing the key of a query parameter in the URL. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column of strings, each representing the value of the extracted part from the URL. |
| |
| Examples |
| -------- |
| Example 1: Extracting the query part from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "QUERY")], |
| ... ["url", "part"] |
| ... ) |
| >>> df.select(sf.try_parse_url(df.url, df.part)).show() |
| +------------------------+ |
| |try_parse_url(url, part)| |
| +------------------------+ |
| | query=1| |
| +------------------------+ |
| |
| Example 2: Extracting the value of a specific query parameter from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "QUERY", "query")], |
| ... ["url", "part", "key"] |
| ... ) |
| >>> df.select(sf.try_parse_url(df.url, df.part, df.key)).show() |
| +-----------------------------+ |
| |try_parse_url(url, part, key)| |
| +-----------------------------+ |
| | 1| |
| +-----------------------------+ |
| |
| Example 3: Extracting the protocol part from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "PROTOCOL")], |
| ... ["url", "part"] |
| ... ) |
| >>> df.select(sf.try_parse_url(df.url, df.part)).show() |
| +------------------------+ |
| |try_parse_url(url, part)| |
| +------------------------+ |
| | https| |
| +------------------------+ |
| |
| Example 4: Extracting the host part from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "HOST")], |
| ... ["url", "part"] |
| ... ) |
| >>> df.select(sf.try_parse_url(df.url, df.part)).show() |
| +------------------------+ |
| |try_parse_url(url, part)| |
| +------------------------+ |
| | spark.apache.org| |
| +------------------------+ |
| |
| Example 5: Extracting the path part from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "PATH")], |
| ... ["url", "part"] |
| ... ) |
| >>> df.select(sf.try_parse_url(df.url, df.part)).show() |
| +------------------------+ |
| |try_parse_url(url, part)| |
| +------------------------+ |
| | /path| |
| +------------------------+ |
| |
| Example 6: Invalid URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("inva lid://spark.apache.org/path?query=1", "QUERY", "query")], |
| ... ["url", "part", "key"] |
| ... ) |
| >>> df.select(sf.try_parse_url(df.url, df.part, df.key)).show() |
| +-----------------------------+ |
| |try_parse_url(url, part, key)| |
| +-----------------------------+ |
| | NULL| |
| +-----------------------------+ |
| """ |
| if key is not None: |
| return _invoke_function_over_columns("try_parse_url", url, partToExtract, key) |
| else: |
| return _invoke_function_over_columns("try_parse_url", url, partToExtract) |
| |
| |
| @_try_remote_functions |
| def parse_url( |
| url: "ColumnOrName", partToExtract: "ColumnOrName", key: Optional["ColumnOrName"] = None |
| ) -> Column: |
| """ |
| URL function: Extracts a specified part from a URL. If a key is provided, |
| it returns the associated query parameter value. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| url : :class:`~pyspark.sql.Column` or str |
| A column of strings, each representing a URL. |
| partToExtract : :class:`~pyspark.sql.Column` or str |
| A column of strings, each representing the part to extract from the URL. |
| key : :class:`~pyspark.sql.Column` or str, optional |
| A column of strings, each representing the key of a query parameter in the URL. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column of strings, each representing the value of the extracted part from the URL. |
| |
| Examples |
| -------- |
| Example 1: Extracting the query part from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "QUERY")], |
| ... ["url", "part"] |
| ... ) |
| >>> df.select(sf.parse_url(df.url, df.part)).show() |
| +--------------------+ |
| |parse_url(url, part)| |
| +--------------------+ |
| | query=1| |
| +--------------------+ |
| |
| Example 2: Extracting the value of a specific query parameter from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "QUERY", "query")], |
| ... ["url", "part", "key"] |
| ... ) |
| >>> df.select(sf.parse_url(df.url, df.part, df.key)).show() |
| +-------------------------+ |
| |parse_url(url, part, key)| |
| +-------------------------+ |
| | 1| |
| +-------------------------+ |
| |
| Example 3: Extracting the protocol part from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "PROTOCOL")], |
| ... ["url", "part"] |
| ... ) |
| >>> df.select(sf.parse_url(df.url, df.part)).show() |
| +--------------------+ |
| |parse_url(url, part)| |
| +--------------------+ |
| | https| |
| +--------------------+ |
| |
| Example 4: Extracting the host part from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "HOST")], |
| ... ["url", "part"] |
| ... ) |
| >>> df.select(sf.parse_url(df.url, df.part)).show() |
| +--------------------+ |
| |parse_url(url, part)| |
| +--------------------+ |
| | spark.apache.org| |
| +--------------------+ |
| |
| Example 5: Extracting the path part from a URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("https://spark.apache.org/path?query=1", "PATH")], |
| ... ["url", "part"] |
| ... ) |
| >>> df.select(sf.parse_url(df.url, df.part)).show() |
| +--------------------+ |
| |parse_url(url, part)| |
| +--------------------+ |
| | /path| |
| +--------------------+ |
| """ |
| if key is not None: |
| return _invoke_function_over_columns("parse_url", url, partToExtract, key) |
| else: |
| return _invoke_function_over_columns("parse_url", url, partToExtract) |
| |
| |
| @_try_remote_functions |
| def printf(format: "ColumnOrName", *cols: "ColumnOrName") -> Column: |
| """ |
| Formats the arguments in printf-style and returns the result as a string column. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| format : :class:`~pyspark.sql.Column` or str |
a string that can contain embedded format tags, used as the result column's value
| cols : :class:`~pyspark.sql.Column` or str |
| column names or :class:`~pyspark.sql.Column`\\s to be used in formatting |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("aa%d%s", 123, "cc",)], ["a", "b", "c"] |
| ... ).select(sf.printf("a", "b", "c")).show() |
| +---------------+ |
| |printf(a, b, c)| |
| +---------------+ |
| | aa123cc| |
| +---------------+ |
| """ |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| sc = _get_active_spark_context() |
| return _invoke_function("printf", _to_java_column(format), _to_seq(sc, cols, _to_java_column)) |
| |
| |
| @_try_remote_functions |
| def url_decode(str: "ColumnOrName") -> Column: |
| """ |
| URL function: Decodes a URL-encoded string in 'application/x-www-form-urlencoded' |
| format to its original format. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| A column of strings, each representing a URL-encoded string. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column of strings, each representing the decoded string. |
| |
| Examples |
| -------- |
| Example 1: Decoding a URL-encoded string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)], ["url"]) |
| >>> df.select(sf.url_decode(df.url)).show(truncate=False) |
| +------------------------+ |
| |url_decode(url) | |
| +------------------------+ |
| |https://spark.apache.org| |
| +------------------------+ |
| |
| Example 2: Decoding a URL-encoded string with spaces |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Hello%20World%21",)], ["url"]) |
| >>> df.select(sf.url_decode(df.url)).show() |
| +---------------+ |
| |url_decode(url)| |
| +---------------+ |
| | Hello World!| |
| +---------------+ |
| |
| Example 3: Decoding a URL-encoded string with special characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("A%2BB%3D%3D",)], ["url"]) |
| >>> df.select(sf.url_decode(df.url)).show() |
| +---------------+ |
| |url_decode(url)| |
| +---------------+ |
| | A+B==| |
| +---------------+ |
| |
| Example 4: Decoding a URL-encoded string with non-ASCII characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("%E4%BD%A0%E5%A5%BD",)], ["url"]) |
| >>> df.select(sf.url_decode(df.url)).show() |
| +---------------+ |
| |url_decode(url)| |
| +---------------+ |
| | 你好| |
| +---------------+ |
| |
| Example 5: Decoding a URL-encoded string with hexadecimal values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("%7E%21%40%23%24%25%5E%26%2A%28%29%5F%2B",)], ["url"]) |
| >>> df.select(sf.url_decode(df.url)).show() |
| +---------------+ |
| |url_decode(url)| |
| +---------------+ |
| | ~!@#$%^&*()_+| |
| +---------------+ |
| """ |
| return _invoke_function_over_columns("url_decode", str) |
| |
| |
| @_try_remote_functions |
| def try_url_decode(str: "ColumnOrName") -> Column: |
| """ |
| This is a special version of `url_decode` that performs the same operation, but returns a |
| NULL value instead of raising an error if the decoding cannot be performed. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| A column of strings, each representing a URL-encoded string. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column of strings, each representing the decoded string. |
| |
| Examples |
| -------- |
| Example 1: Decoding a URL-encoded string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)], ["url"]) |
| >>> df.select(sf.try_url_decode(df.url)).show(truncate=False) |
| +------------------------+ |
| |try_url_decode(url) | |
| +------------------------+ |
| |https://spark.apache.org| |
| +------------------------+ |
| |
| Example 2: Return NULL if the decoding cannot be performed. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("https%3A%2F%2spark.apache.org",)], ["url"]) |
| >>> df.select(sf.try_url_decode(df.url)).show() |
| +-------------------+ |
| |try_url_decode(url)| |
| +-------------------+ |
| | NULL| |
| +-------------------+ |
| """ |
| return _invoke_function_over_columns("try_url_decode", str) |
| |
| |
| @_try_remote_functions |
| def url_encode(str: "ColumnOrName") -> Column: |
| """ |
| URL function: Encodes a string into a URL-encoded string in |
| 'application/x-www-form-urlencoded' format. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| A column of strings, each representing a string to be URL-encoded. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column of strings, each representing the URL-encoded string. |
| |
| Examples |
| -------- |
| Example 1: Encoding a simple URL |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("https://spark.apache.org",)], ["url"]) |
| >>> df.select(sf.url_encode(df.url)).show(truncate=False) |
| +------------------------------+ |
| |url_encode(url) | |
| +------------------------------+ |
| |https%3A%2F%2Fspark.apache.org| |
| +------------------------------+ |
| |
| Example 2: Encoding a URL with spaces |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Hello World!",)], ["url"]) |
| >>> df.select(sf.url_encode(df.url)).show() |
| +---------------+ |
| |url_encode(url)| |
| +---------------+ |
| | Hello+World%21| |
| +---------------+ |
| |
| Example 3: Encoding a URL with special characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("A+B==",)], ["url"]) |
| >>> df.select(sf.url_encode(df.url)).show() |
| +---------------+ |
| |url_encode(url)| |
| +---------------+ |
| | A%2BB%3D%3D| |
| +---------------+ |
| |
| Example 4: Encoding a URL with non-ASCII characters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("你好",)], ["url"]) |
| >>> df.select(sf.url_encode(df.url)).show() |
| +------------------+ |
| | url_encode(url)| |
| +------------------+ |
| |%E4%BD%A0%E5%A5%BD| |
| +------------------+ |
| |
| Example 5: Encoding a URL with hexadecimal values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("~!@#$%^&*()_+",)], ["url"]) |
| >>> df.select(sf.url_encode(df.url)).show(truncate=False) |
| +-----------------------------------+ |
| |url_encode(url) | |
| +-----------------------------------+ |
| |%7E%21%40%23%24%25%5E%26*%28%29_%2B| |
| +-----------------------------------+ |
| """ |
| return _invoke_function_over_columns("url_encode", str) |
| |
| |
| @_try_remote_functions |
| def position( |
| substr: "ColumnOrName", str: "ColumnOrName", start: Optional["ColumnOrName"] = None |
| ) -> Column: |
| """ |
| Returns the position of the first occurrence of `substr` in `str` after position `start`. |
| The given `start` and return value are 1-based. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| substr : :class:`~pyspark.sql.Column` or str |
| A column of string, substring. |
| str : :class:`~pyspark.sql.Column` or str |
| A column of string. |
| start : :class:`~pyspark.sql.Column` or str, optional |
A column of integer, the start position (1-based).
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [("bar", "foobarbar", 5,)], ["a", "b", "c"] |
| ... ).select(sf.position("a", "b", "c")).show() |
| +-----------------+ |
| |position(a, b, c)| |
| +-----------------+ |
| | 7| |
| +-----------------+ |
| |
| >>> spark.createDataFrame( |
| ... [("bar", "foobarbar", 5,)], ["a", "b", "c"] |
| ... ).select(sf.position("a", "b")).show() |
| +-----------------+ |
| |position(a, b, 1)| |
| +-----------------+ |
| | 4| |
| +-----------------+ |
| """ |
| if start is not None: |
| return _invoke_function_over_columns("position", substr, str, start) |
| else: |
| return _invoke_function_over_columns("position", substr, str) |
| |
| |
| @_try_remote_functions |
| def endswith(str: "ColumnOrName", suffix: "ColumnOrName") -> Column: |
| """ |
| Returns a boolean. The value is True if str ends with suffix. |
| Returns NULL if either input expression is NULL. Otherwise, returns False. |
Both str and suffix must be of STRING or BINARY type.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| A column of string. |
| suffix : :class:`~pyspark.sql.Column` or str |
| A column of string, the suffix. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"]) |
| >>> df.select(endswith(df.a, df.b).alias('r')).collect() |
| [Row(r=False)] |
| |
| >>> df = spark.createDataFrame([("414243", "4243",)], ["e", "f"]) |
| >>> df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f")) |
| >>> df.printSchema() |
| root |
| |-- e: binary (nullable = true) |
| |-- f: binary (nullable = true) |
| >>> df.select(endswith("e", "f"), endswith("f", "e")).show() |
| +--------------+--------------+ |
| |endswith(e, f)|endswith(f, e)| |
| +--------------+--------------+ |
| | true| false| |
| +--------------+--------------+ |
| """ |
| return _invoke_function_over_columns("endswith", str, suffix) |
| |
| |
| @_try_remote_functions |
| def startswith(str: "ColumnOrName", prefix: "ColumnOrName") -> Column: |
| """ |
| Returns a boolean. The value is True if str starts with prefix. |
| Returns NULL if either input expression is NULL. Otherwise, returns False. |
Both str and prefix must be of STRING or BINARY type.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| A column of string. |
| prefix : :class:`~pyspark.sql.Column` or str |
| A column of string, the prefix. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"]) |
| >>> df.select(startswith(df.a, df.b).alias('r')).collect() |
| [Row(r=True)] |
| |
| >>> df = spark.createDataFrame([("414243", "4142",)], ["e", "f"]) |
| >>> df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f")) |
| >>> df.printSchema() |
| root |
| |-- e: binary (nullable = true) |
| |-- f: binary (nullable = true) |
| >>> df.select(startswith("e", "f"), startswith("f", "e")).show() |
| +----------------+----------------+ |
| |startswith(e, f)|startswith(f, e)| |
| +----------------+----------------+ |
| | true| false| |
| +----------------+----------------+ |
| """ |
| return _invoke_function_over_columns("startswith", str, prefix) |
| |
| |
| @_try_remote_functions |
| def char(col: "ColumnOrName") -> Column: |
| """ |
Returns the ASCII character having the binary equivalent to `col`. If `col` is larger
than 256, the result is equivalent to char(col % 256).
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.char(sf.lit(65))).show() |
| +--------+ |
| |char(65)| |
| +--------+ |
| | A| |
| +--------+ |
| """ |
| return _invoke_function_over_columns("char", col) |
| |
| |
| @_try_remote_functions |
| def btrim(str: "ColumnOrName", trim: Optional["ColumnOrName"] = None) -> Column: |
| """ |
| Remove the leading and trailing `trim` characters from `str`. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| trim : :class:`~pyspark.sql.Column` or str, optional |
The characters to trim; the default value is a single space.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("SSparkSQLS", "SL", )], ['a', 'b']) |
| >>> df.select(btrim(df.a, df.b).alias('r')).collect() |
| [Row(r='parkSQ')] |
| |
| >>> df = spark.createDataFrame([(" SparkSQL ",)], ['a']) |
| >>> df.select(btrim(df.a).alias('r')).collect() |
| [Row(r='SparkSQL')] |
| """ |
| if trim is not None: |
| return _invoke_function_over_columns("btrim", str, trim) |
| else: |
| return _invoke_function_over_columns("btrim", str) |
| |
| |
| @_try_remote_functions |
| def char_length(str: "ColumnOrName") -> Column: |
| """ |
| Returns the character length of string data or number of bytes of binary data. |
| The length of string data includes the trailing spaces. |
| The length of binary data includes binary zeros. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.char_length(sf.lit("SparkSQL"))).show() |
| +---------------------+ |
| |char_length(SparkSQL)| |
| +---------------------+ |
| | 8| |
| +---------------------+ |
| """ |
| return _invoke_function_over_columns("char_length", str) |
| |
| |
| @_try_remote_functions |
| def character_length(str: "ColumnOrName") -> Column: |
| """ |
| Returns the character length of string data or number of bytes of binary data. |
| The length of string data includes the trailing spaces. |
| The length of binary data includes binary zeros. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.character_length(sf.lit("SparkSQL"))).show() |
| +--------------------------+ |
| |character_length(SparkSQL)| |
| +--------------------------+ |
| | 8| |
| +--------------------------+ |
| """ |
| return _invoke_function_over_columns("character_length", str) |
| |
| |
| @_try_remote_functions |
| def chr(n: "ColumnOrName") -> Column: |
| """ |
| Returns the ASCII character having the binary equivalent to `n`. |
If `n` is larger than 256, the result is equivalent to chr(n % 256).
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| n : :class:`~pyspark.sql.Column` or column name |
| target column to compute on. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(60, 70).select("*", sf.chr("id")).show() |
| +---+-------+ |
| | id|chr(id)| |
| +---+-------+ |
| | 60| <| |
| | 61| =| |
| | 62| >| |
| | 63| ?| |
| | 64| @| |
| | 65| A| |
| | 66| B| |
| | 67| C| |
| | 68| D| |
| | 69| E| |
| +---+-------+ |
| """ |
| return _invoke_function_over_columns("chr", n) |
| |
| |
| @_try_remote_functions |
| def try_to_binary(col: "ColumnOrName", format: Optional["ColumnOrName"] = None) -> Column: |
| """ |
| This is a special version of `to_binary` that performs the same operation, but returns a NULL |
| value instead of raising an error if the conversion cannot be performed. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| format : :class:`~pyspark.sql.Column` or str, optional |
| format to use to convert binary values. |
| |
| Examples |
| -------- |
| Example 1: Convert string to a binary with encoding specified |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("abc",)], ["e"]) |
| >>> df.select(sf.try_to_binary(df.e, sf.lit("utf-8")).alias('r')).collect() |
| [Row(r=bytearray(b'abc'))] |
| |
Example 2: Convert a hexadecimal string to binary without encoding specified
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("414243",)], ["e"]) |
| >>> df.select(sf.try_to_binary(df.e).alias('r')).collect() |
| [Row(r=bytearray(b'ABC'))] |
| |
Example 3: Conversion failure results in NULL when ANSI mode is on
| |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... df = spark.range(1) |
| ... df.select(sf.try_to_binary(sf.lit("malformed"), sf.lit("hex"))).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +-----------------------------+ |
| |try_to_binary(malformed, hex)| |
| +-----------------------------+ |
| | NULL| |
| +-----------------------------+ |
| """ |
| if format is not None: |
| return _invoke_function_over_columns("try_to_binary", col, format) |
| else: |
| return _invoke_function_over_columns("try_to_binary", col) |
| |
| |
| @_try_remote_functions |
| def try_to_number(col: "ColumnOrName", format: "ColumnOrName") -> Column: |
| """ |
| Convert string 'col' to a number based on the string format `format`. Returns NULL if the |
| string 'col' does not match the expected format. The format follows the same semantics as the |
| to_number function. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
format : :class:`~pyspark.sql.Column` or str
| format to use to convert number values. |
| |
| Examples |
| -------- |
| Example 1: Convert a string to a number with a format specified |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([("$78.12",)], ["e"]) |
| >>> df.select(sf.try_to_number(df.e, sf.lit("$99.99")).alias('r')).show() |
| +-----+ |
| | r| |
| +-----+ |
| |78.12| |
| +-----+ |
| |
Example 2: Conversion failure results in NULL when ANSI mode is on
| |
| >>> import pyspark.sql.functions as sf |
| >>> origin = spark.conf.get("spark.sql.ansi.enabled") |
| >>> spark.conf.set("spark.sql.ansi.enabled", "true") |
| >>> try: |
| ... df = spark.range(1) |
| ... df.select(sf.try_to_number(sf.lit("77"), sf.lit("$99.99")).alias('r')).show() |
| ... finally: |
| ... spark.conf.set("spark.sql.ansi.enabled", origin) |
| +----+ |
| | r| |
| +----+ |
| |NULL| |
| +----+ |
| """ |
| return _invoke_function_over_columns("try_to_number", col, format) |
| |
| |
| @_try_remote_functions |
| def contains(left: "ColumnOrName", right: "ColumnOrName") -> Column: |
| """ |
| Returns a boolean. The value is True if right is found inside left. |
| Returns NULL if either input expression is NULL. Otherwise, returns False. |
Both left and right must be of STRING or BINARY type.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| left : :class:`~pyspark.sql.Column` or str |
| The input column or strings to check, may be NULL. |
| right : :class:`~pyspark.sql.Column` or str |
| The input column or strings to find, may be NULL. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("Spark SQL", "Spark")], ['a', 'b']) |
| >>> df.select(contains(df.a, df.b).alias('r')).collect() |
| [Row(r=True)] |
| |
| >>> df = spark.createDataFrame([("414243", "4243",)], ["c", "d"]) |
| >>> df = df.select(to_binary("c").alias("c"), to_binary("d").alias("d")) |
| >>> df.printSchema() |
| root |
| |-- c: binary (nullable = true) |
| |-- d: binary (nullable = true) |
| >>> df.select(contains("c", "d"), contains("d", "c")).show() |
| +--------------+--------------+ |
| |contains(c, d)|contains(d, c)| |
| +--------------+--------------+ |
| | true| false| |
| +--------------+--------------+ |
| """ |
| return _invoke_function_over_columns("contains", left, right) |
| |
| |
| @_try_remote_functions |
| def elt(*inputs: "ColumnOrName") -> Column: |
| """ |
| Returns the `n`-th input, e.g., returns `input2` when `n` is 2. |
| The function returns NULL if the index exceeds the length of the array |
| and `spark.sql.ansi.enabled` is set to false. If `spark.sql.ansi.enabled` is set to true, |
| it throws ArrayIndexOutOfBoundsException for invalid indices. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| inputs : :class:`~pyspark.sql.Column` or str |
| Input columns or strings. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(1, "scala", "java")], ['a', 'b', 'c']) |
| >>> df.select(elt(df.a, df.b, df.c).alias('r')).collect() |
| [Row(r='scala')] |
| """ |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| sc = _get_active_spark_context() |
| return _invoke_function("elt", _to_seq(sc, inputs, _to_java_column)) |
| |
| |
| @_try_remote_functions |
| def find_in_set(str: "ColumnOrName", str_array: "ColumnOrName") -> Column: |
| """ |
Returns the index (1-based) of the given string (`str`) in the comma-delimited
list (`str_array`). Returns 0 if the string was not found or if the given string (`str`)
contains a comma.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| The given string to be found. |
| str_array : :class:`~pyspark.sql.Column` or str |
| The comma-delimited list. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("ab", "abc,b,ab,c,def")], ['a', 'b']) |
| >>> df.select(find_in_set(df.a, df.b).alias('r')).collect() |
| [Row(r=3)] |
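
As noted above, the result is 0 when the given string itself contains a comma:

>>> df.select(find_in_set(lit("a,b"), df.b).alias('r')).collect()
[Row(r=0)]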
| """ |
| return _invoke_function_over_columns("find_in_set", str, str_array) |
| |
| |
| @_try_remote_functions |
| def like( |
| str: "ColumnOrName", pattern: "ColumnOrName", escapeChar: Optional["Column"] = None |
| ) -> Column: |
| """ |
| Returns true if str matches `pattern` with `escape`, |
| null if any arguments are null, false otherwise. |
| The default escape character is the '\'. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| A string. |
| pattern : :class:`~pyspark.sql.Column` or str |
| A string. The pattern is a string which is matched literally, with |
| exception to the following special symbols: |
| _ matches any one character in the input (similar to . in posix regular expressions) |
| % matches zero or more characters in the input (similar to .* in posix regular |
| expressions) |
| Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order |
| to match "\abc", the pattern should be "\\abc". |
| When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back |
| to Spark 1.6 behavior regarding string literal parsing. For example, if the config is |
| enabled, the pattern to match "\abc" should be "\abc". |
| escapeChar : :class:`~pyspark.sql.Column`, optional |
A character added since Spark 3.0. The default escape character is the '\'.
| If an escape character precedes a special symbol or another escape character, the |
| following character is matched literally. It is invalid to escape any other character. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("Spark", "_park")], ['a', 'b']) |
| >>> df.select(like(df.a, df.b).alias('r')).collect() |
| [Row(r=True)] |
| |
| >>> df = spark.createDataFrame( |
| ... [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")], |
| ... ['a', 'b'] |
| ... ) |
| >>> df.select(like(df.a, df.b, lit('/')).alias('r')).collect() |
| [Row(r=True)] |
| """ |
| if escapeChar is not None: |
| return _invoke_function_over_columns("like", str, pattern, escapeChar) |
| else: |
| return _invoke_function_over_columns("like", str, pattern) |
| |
| |
| @_try_remote_functions |
| def ilike( |
| str: "ColumnOrName", pattern: "ColumnOrName", escapeChar: Optional["Column"] = None |
| ) -> Column: |
| """ |
| Returns true if str matches `pattern` with `escape` case-insensitively, |
| null if any arguments are null, false otherwise. |
| The default escape character is the '\'. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| A string. |
| pattern : :class:`~pyspark.sql.Column` or str |
| A string. The pattern is a string which is matched literally, with |
| exception to the following special symbols: |
| _ matches any one character in the input (similar to . in posix regular expressions) |
| % matches zero or more characters in the input (similar to .* in posix regular |
| expressions) |
| Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order |
| to match "\abc", the pattern should be "\\abc". |
| When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back |
| to Spark 1.6 behavior regarding string literal parsing. For example, if the config is |
| enabled, the pattern to match "\abc" should be "\abc". |
| escapeChar : :class:`~pyspark.sql.Column`, optional |
A character added since Spark 3.0. The default escape character is the '\'.
| If an escape character precedes a special symbol or another escape character, the |
| following character is matched literally. It is invalid to escape any other character. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("Spark", "_park")], ['a', 'b']) |
| >>> df.select(ilike(df.a, df.b).alias('r')).collect() |
| [Row(r=True)] |
| |
| >>> df = spark.createDataFrame( |
| ... [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")], |
| ... ['a', 'b'] |
| ... ) |
| >>> df.select(ilike(df.a, df.b, lit('/')).alias('r')).collect() |
| [Row(r=True)] |
| """ |
| if escapeChar is not None: |
| return _invoke_function_over_columns("ilike", str, pattern, escapeChar) |
| else: |
| return _invoke_function_over_columns("ilike", str, pattern) |
| |
| |
| @_try_remote_functions |
| def lcase(str: "ColumnOrName") -> Column: |
| """ |
| Returns `str` with all characters changed to lowercase. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.lcase(sf.lit("Spark"))).show() |
| +------------+ |
| |lcase(Spark)| |
| +------------+ |
| | spark| |
| +------------+ |
| """ |
| return _invoke_function_over_columns("lcase", str) |
| |
| |
| @_try_remote_functions |
| def ucase(str: "ColumnOrName") -> Column: |
| """ |
| Returns `str` with all characters changed to uppercase. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.ucase(sf.lit("Spark"))).show() |
| +------------+ |
| |ucase(Spark)| |
| +------------+ |
| | SPARK| |
| +------------+ |
| """ |
| return _invoke_function_over_columns("ucase", str) |
| |
| |
| @_try_remote_functions |
| def left(str: "ColumnOrName", len: "ColumnOrName") -> Column: |
| """ |
Returns the leftmost `len` (`len` can be string type) characters from the string `str`.
If `len` is less than or equal to 0, the result is an empty string.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| len : :class:`~pyspark.sql.Column` or str |
Input column or strings, the number of leftmost characters to return.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b']) |
| >>> df.select(left(df.a, df.b).alias('r')).collect() |
| [Row(r='Spa')] |
| """ |
| return _invoke_function_over_columns("left", str, len) |
| |
| |
| @_try_remote_functions |
| def right(str: "ColumnOrName", len: "ColumnOrName") -> Column: |
| """ |
Returns the rightmost `len` (`len` can be string type) characters from the string `str`.
If `len` is less than or equal to 0, the result is an empty string.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| str : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| len : :class:`~pyspark.sql.Column` or str |
Input column or strings, the number of rightmost characters to return.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b']) |
| >>> df.select(right(df.a, df.b).alias('r')).collect() |
| [Row(r='SQL')] |
| """ |
| return _invoke_function_over_columns("right", str, len) |
| |
| |
| @_try_remote_functions |
| def mask( |
| col: "ColumnOrName", |
| upperChar: Optional["ColumnOrName"] = None, |
| lowerChar: Optional["ColumnOrName"] = None, |
| digitChar: Optional["ColumnOrName"] = None, |
| otherChar: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Masks the given string value. This can be useful for creating copies of tables with sensitive |
| information removed. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col: :class:`~pyspark.sql.Column` or str |
| target column to compute on. |
| upperChar: :class:`~pyspark.sql.Column` or str, optional |
| character to replace upper-case characters with. Specify NULL to retain original character. |
| lowerChar: :class:`~pyspark.sql.Column` or str, optional |
| character to replace lower-case characters with. Specify NULL to retain original character. |
| digitChar: :class:`~pyspark.sql.Column` or str, optional |
| character to replace digit characters with. Specify NULL to retain original character. |
| otherChar: :class:`~pyspark.sql.Column` or str, optional |
| character to replace all other characters with. Specify NULL to retain original character. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([("AbCD123-@$#",), ("abcd-EFGH-8765-4321",)], ['data']) |
| >>> df.select(mask(df.data).alias('r')).collect() |
| [Row(r='XxXXnnn-@$#'), Row(r='xxxx-XXXX-nnnn-nnnn')] |
| >>> df.select(mask(df.data, lit('Y')).alias('r')).collect() |
| [Row(r='YxYYnnn-@$#'), Row(r='xxxx-YYYY-nnnn-nnnn')] |
| >>> df.select(mask(df.data, lit('Y'), lit('y')).alias('r')).collect() |
| [Row(r='YyYYnnn-@$#'), Row(r='yyyy-YYYY-nnnn-nnnn')] |
| >>> df.select(mask(df.data, lit('Y'), lit('y'), lit('d')).alias('r')).collect() |
| [Row(r='YyYYddd-@$#'), Row(r='yyyy-YYYY-dddd-dddd')] |
| >>> df.select(mask(df.data, lit('Y'), lit('y'), lit('d'), lit('*')).alias('r')).collect() |
| [Row(r='YyYYddd****'), Row(r='yyyy*YYYY*dddd*dddd')] |
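
Passing NULL for a replacement character retains the original characters; here the
upper-case characters are kept unchanged:

>>> df.select(mask(df.data, lit(None)).alias('r')).collect()
[Row(r='AxCDnnn-@$#'), Row(r='xxxx-EFGH-nnnn-nnnn')]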
| """ |
| |
| _upperChar = lit("X") if upperChar is None else upperChar |
| _lowerChar = lit("x") if lowerChar is None else lowerChar |
| _digitChar = lit("n") if digitChar is None else digitChar |
| _otherChar = lit(None) if otherChar is None else otherChar |
| return _invoke_function_over_columns( |
| "mask", col, _upperChar, _lowerChar, _digitChar, _otherChar |
| ) |
| |
| |
| @_try_remote_functions |
| def collate(col: "ColumnOrName", collation: str) -> Column: |
| """ |
| Marks a given column with specified collation. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Target string column to work on. |
| collation : str |
| Target collation name. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column of string type, where each value has the specified collation. |
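
Examples
--------
A minimal illustration using the builtin 'UTF8_BINARY' collation; the collated value
collects back as the original string, since collation only changes comparison semantics.

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([('name',)], ['dt'])
>>> df.select(sf.collate('dt', 'UTF8_BINARY').alias('r')).collect()
[Row(r='name')]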
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("collate", _to_java_column(col), _enum_to_value(collation)) |
| |
| |
| @_try_remote_functions |
| def collation(col: "ColumnOrName") -> Column: |
| """ |
| Returns the collation name of a given column. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Target string column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| collation name of a given expression. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([('name',)], ['dt']) |
| >>> df.select(collation('dt').alias('collation')).show(truncate=False) |
| +--------------------------+ |
| |collation | |
| +--------------------------+ |
| |SYSTEM.BUILTIN.UTF8_BINARY| |
| +--------------------------+ |
| """ |
| return _invoke_function_over_columns("collation", col) |
| |
| |
| @_try_remote_functions |
| def quote(col: "ColumnOrName") -> Column: |
| r"""Returns `str` enclosed by single quotes and each instance of |
| single quote in it is preceded by a backslash. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to be quoted. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| quoted string |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame(["Don't"], "STRING") |
| >>> df.select("*", sf.quote("value")).show() |
| +-----+------------+ |
| |value|quote(value)| |
| +-----+------------+ |
| |Don't| 'Don\'t'| |
| +-----+------------+ |
| """ |
| return _invoke_function_over_columns("quote", col) |
| |
| |
| # ---------------------- Collection functions ------------------------------ |
| |
| |
| @overload |
| def create_map(*cols: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @overload |
| def create_map(__cols: Union[Sequence["ColumnOrName"], Tuple["ColumnOrName", ...]]) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def create_map( |
| *cols: Union["ColumnOrName", Union[Sequence["ColumnOrName"], Tuple["ColumnOrName", ...]]] |
| ) -> Column: |
| """ |
| Map function: Creates a new map column from an even number of input columns or |
| column references. The input columns are grouped into key-value pairs to form a map. |
| For instance, the input (key1, value1, key2, value2, ...) would produce a map that |
| associates key1 with value1, key2 with value2, and so on. The function supports |
| grouping columns as a list as well. |
| |
| .. versionadded:: 2.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or str |
| The input column names or :class:`~pyspark.sql.Column` objects grouped into |
| key-value pairs. These can also be expressed as a list of columns. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new Column of Map type, where each value is a map formed from the corresponding |
| key-value pairs provided in the input arguments. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of create_map function. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) |
| >>> df.select(sf.create_map('name', 'age')).show() |
| +--------------+ |
| |map(name, age)| |
| +--------------+ |
| | {Alice -> 2}| |
| | {Bob -> 5}| |
| +--------------+ |
| |
| Example 2: Usage of create_map function with a list of columns. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age")) |
| >>> df.select(sf.create_map([df.name, df.age])).show() |
| +--------------+ |
| |map(name, age)| |
| +--------------+ |
| | {Alice -> 2}| |
| | {Bob -> 5}| |
| +--------------+ |
| |
| Example 3: Usage of create_map function with more than one key-value pair. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", 2, "female"), |
| ... ("Bob", 5, "male")], ("name", "age", "gender")) |
| >>> df.select(sf.create_map(sf.lit('name'), df['name'], |
| ... sf.lit('gender'), df['gender'])).show(truncate=False) |
| +---------------------------------+ |
| |map(name, name, gender, gender) | |
| +---------------------------------+ |
| |{name -> Alice, gender -> female}| |
| |{name -> Bob, gender -> male} | |
| +---------------------------------+ |
| |
| Example 4: Usage of create_map function with values of different types. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", 2, 22.2), |
| ... ("Bob", 5, 36.1)], ("name", "age", "weight")) |
| >>> df.select(sf.create_map(sf.lit('age'), df['age'], |
| ... sf.lit('weight'), df['weight'])).show(truncate=False) |
| +-----------------------------+ |
| |map(age, age, weight, weight)| |
| +-----------------------------+ |
| |{age -> 2.0, weight -> 22.2} | |
| |{age -> 5.0, weight -> 36.1} | |
| +-----------------------------+ |
| """ |
| if len(cols) == 1 and isinstance(cols[0], (list, set)): |
| cols = cols[0] # type: ignore[assignment] |
| return _invoke_function_over_seq_of_columns("map", cols) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def map_from_arrays(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
Map function: Creates a new map from two arrays. This function takes two arrays of
keys and values respectively, and returns a new map column.

.. versionadded:: 2.4.0
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or str |
        Name of column containing a set of keys. The elements must not be null.
| col2 : :class:`~pyspark.sql.Column` or str |
| Name of column containing a set of values. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column of map type. |
| |
| Notes |
| ----- |
| The input arrays for keys and values must have the same length and all elements |
| in keys should not be null. If these conditions are not met, an exception will be thrown. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of map_from_arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([2, 5], ['a', 'b'])], ['k', 'v']) |
| >>> df.select(sf.map_from_arrays(df.k, df.v)).show() |
| +---------------------+ |
| |map_from_arrays(k, v)| |
| +---------------------+ |
| | {2 -> a, 5 -> b}| |
| +---------------------+ |
| |
| Example 2: map_from_arrays with null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2], ['a', None])], ['k', 'v']) |
| >>> df.select(sf.map_from_arrays(df.k, df.v)).show() |
| +---------------------+ |
| |map_from_arrays(k, v)| |
| +---------------------+ |
| | {1 -> a, 2 -> NULL}| |
| +---------------------+ |
| |
| Example 3: map_from_arrays with empty arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField('k', ArrayType(IntegerType())), |
| ... StructField('v', ArrayType(StringType())) |
| ... ]) |
| >>> df = spark.createDataFrame([([], [])], schema=schema) |
| >>> df.select(sf.map_from_arrays(df.k, df.v)).show() |
| +---------------------+ |
| |map_from_arrays(k, v)| |
| +---------------------+ |
| | {}| |
| +---------------------+ |
| """ |
| return _invoke_function_over_columns("map_from_arrays", col1, col2) |
| |
| |
| @overload |
| def array(*cols: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @overload |
| def array(__cols: Union[Sequence["ColumnOrName"], Tuple["ColumnOrName", ...]]) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def array( |
| *cols: Union["ColumnOrName", Union[Sequence["ColumnOrName"], Tuple["ColumnOrName", ...]]] |
| ) -> Column: |
| """ |
| Collection function: Creates a new array column from the input columns or column names. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or str |
| Column names or :class:`~pyspark.sql.Column` objects that have the same data type. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new Column of array type, where each value is an array containing the corresponding values |
| from the input columns. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of array function with column names. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", "doctor"), ("Bob", "engineer")], |
| ... ("name", "occupation")) |
| >>> df.select(sf.array('name', 'occupation')).show() |
| +-----------------------+ |
| |array(name, occupation)| |
| +-----------------------+ |
| | [Alice, doctor]| |
| | [Bob, engineer]| |
| +-----------------------+ |
| |
| Example 2: Usage of array function with Column objects. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", "doctor"), ("Bob", "engineer")], |
| ... ("name", "occupation")) |
| >>> df.select(sf.array(df.name, df.occupation)).show() |
| +-----------------------+ |
| |array(name, occupation)| |
| +-----------------------+ |
| | [Alice, doctor]| |
| | [Bob, engineer]| |
| +-----------------------+ |
| |
| Example 3: Single argument as list of column names. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", "doctor"), ("Bob", "engineer")], |
| ... ("name", "occupation")) |
| >>> df.select(sf.array(['name', 'occupation'])).show() |
| +-----------------------+ |
| |array(name, occupation)| |
| +-----------------------+ |
| | [Alice, doctor]| |
| | [Bob, engineer]| |
| +-----------------------+ |
| |
| Example 4: Usage of array function with columns of different types. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [("Alice", 2, 22.2), ("Bob", 5, 36.1)], |
| ... ("name", "age", "weight")) |
| >>> df.select(sf.array(['age', 'weight'])).show() |
| +------------------+ |
| |array(age, weight)| |
| +------------------+ |
| | [2.0, 22.2]| |
| | [5.0, 36.1]| |
| +------------------+ |
| |
| Example 5: array function with a column containing null values. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("Alice", None), ("Bob", "engineer")], |
| ... ("name", "occupation")) |
| >>> df.select(sf.array('name', 'occupation')).show() |
| +-----------------------+ |
| |array(name, occupation)| |
| +-----------------------+ |
| | [Alice, NULL]| |
| | [Bob, engineer]| |
| +-----------------------+ |
| """ |
| if len(cols) == 1 and isinstance(cols[0], (list, set)): |
| cols = cols[0] # type: ignore[assignment] |
| return _invoke_function_over_seq_of_columns("array", cols) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def array_contains(col: "ColumnOrName", value: Any) -> Column: |
| """ |
    Collection function: returns a boolean column indicating whether the array contains
    the given value. Returns null if the array is null, true if the array contains the
    given value, and false otherwise.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The target column containing the arrays. |
    value : Any
        The value or column to check for in the array.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new Column of Boolean type, where each value indicates whether the corresponding array |
| from the input column contains the specified value. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of array_contains function. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) |
| >>> df.select(sf.array_contains(df.data, "a")).show() |
| +-----------------------+ |
| |array_contains(data, a)| |
| +-----------------------+ |
| | true| |
| | false| |
| +-----------------------+ |
| |
| Example 2: Usage of array_contains function with a column. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"], "c"), |
| ... (["c", "d", "e"], "d"), |
| ... (["e", "a", "c"], "b")], ["data", "item"]) |
| >>> df.select(sf.array_contains(df.data, sf.col("item"))).show() |
| +--------------------------+ |
| |array_contains(data, item)| |
| +--------------------------+ |
| | true| |
| | true| |
| | false| |
| +--------------------------+ |
| |
| Example 3: Attempt to use array_contains function with a null array. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(None,), (["a", "b", "c"],)], ['data']) |
| >>> df.select(sf.array_contains(df.data, "a")).show() |
| +-----------------------+ |
| |array_contains(data, a)| |
| +-----------------------+ |
| | NULL| |
| | true| |
| +-----------------------+ |
| |
| Example 4: Usage of array_contains with an array column containing null values. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", None, "c"],)], ['data']) |
| >>> df.select(sf.array_contains(df.data, "a")).show() |
| +-----------------------+ |
| |array_contains(data, a)| |
| +-----------------------+ |
| | true| |
| +-----------------------+ |
| """ |
| return _invoke_function_over_columns("array_contains", col, lit(value)) |
| |
| |
| @_try_remote_functions |
| def arrays_overlap(a1: "ColumnOrName", a2: "ColumnOrName") -> Column: |
| """ |
    Collection function: returns a boolean column indicating whether the input arrays
    have at least one common non-null element. Returns true if they do; null if the
    arrays share no common element, neither is empty, and at least one of them contains
    a null element; and false otherwise.
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| a1, a2 : :class:`~pyspark.sql.Column` or str |
| The names of the columns that contain the input arrays. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new Column of Boolean type, where each value indicates whether the corresponding arrays |
| from the input columns contain any common elements. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of arrays_overlap function. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b"], ["b", "c"]), (["a"], ["b", "c"])], ['x', 'y']) |
| >>> df.select(sf.arrays_overlap(df.x, df.y)).show() |
| +--------------------+ |
| |arrays_overlap(x, y)| |
| +--------------------+ |
| | true| |
| | false| |
| +--------------------+ |
| |
| Example 2: Usage of arrays_overlap function with arrays containing null elements. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", None], ["b", None]), (["a"], ["b", "c"])], ['x', 'y']) |
| >>> df.select(sf.arrays_overlap(df.x, df.y)).show() |
| +--------------------+ |
| |arrays_overlap(x, y)| |
| +--------------------+ |
| | NULL| |
| | false| |
| +--------------------+ |
| |
| Example 3: Usage of arrays_overlap function with arrays that are null. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(None, ["b", "c"]), (["a"], None)], ['x', 'y']) |
| >>> df.select(sf.arrays_overlap(df.x, df.y)).show() |
| +--------------------+ |
| |arrays_overlap(x, y)| |
| +--------------------+ |
| | NULL| |
| | NULL| |
| +--------------------+ |
| |
| Example 4: Usage of arrays_overlap on arrays with identical elements. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b"], ["a", "b"]), (["a"], ["a"])], ['x', 'y']) |
| >>> df.select(sf.arrays_overlap(df.x, df.y)).show() |
| +--------------------+ |
| |arrays_overlap(x, y)| |
| +--------------------+ |
| | true| |
| | true| |
| +--------------------+ |
| """ |
| return _invoke_function_over_columns("arrays_overlap", a1, a2) |
| |
| |
| @_try_remote_functions |
| def slice( |
| x: "ColumnOrName", start: Union["ColumnOrName", int], length: Union["ColumnOrName", int] |
| ) -> Column: |
| """ |
    Array function: Returns a new array column by slicing the input array column,
    starting at the given index and taking the specified number of elements. Indices
    start at 1 and can be negative to index from the end of the array.
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| x : :class:`~pyspark.sql.Column` or str |
| Input array column or column name to be sliced. |
| start : :class:`~pyspark.sql.Column`, str, or int |
        The start index for the slice operation. If negative, the index counts from
        the end of the array.
| length : :class:`~pyspark.sql.Column`, str, or int |
| The length of the slice, representing number of elements in the resulting array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new Column object of Array type, where each value is a slice of the corresponding |
| list from the input column. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of the slice function. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x']) |
| >>> df.select(sf.slice(df.x, 2, 2)).show() |
| +--------------+ |
| |slice(x, 2, 2)| |
| +--------------+ |
| | [2, 3]| |
| | [5]| |
| +--------------+ |
| |
| Example 2: Slicing with negative start index. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x']) |
| >>> df.select(sf.slice(df.x, -1, 1)).show() |
| +---------------+ |
| |slice(x, -1, 1)| |
| +---------------+ |
| | [3]| |
| | [5]| |
| +---------------+ |
| |
| Example 3: Slice function with column inputs for start and length. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3], 2, 2), ([4, 5], 1, 3)], ['x', 'start', 'length']) |
| >>> df.select(sf.slice(df.x, df.start, df.length)).show() |
| +-----------------------+ |
| |slice(x, start, length)| |
| +-----------------------+ |
| | [2, 3]| |
| | [4, 5]| |
| +-----------------------+ |
| """ |
| start = _enum_to_value(start) |
| start = lit(start) if isinstance(start, int) else start |
| length = _enum_to_value(length) |
| length = lit(length) if isinstance(length, int) else length |
| |
| return _invoke_function_over_columns("slice", x, start, length) |
| |
| |
| @_try_remote_functions |
| def array_join( |
| col: "ColumnOrName", delimiter: str, null_replacement: Optional[str] = None |
| ) -> Column: |
| """ |
| Array function: Returns a string column by concatenating the elements of the input |
| array column using the delimiter. Null values within the array can be replaced with |
| a specified string through the null_replacement argument. If null_replacement is |
| not set, null values are ignored. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The input column containing the arrays to be joined. |
| delimiter : str |
| The string to be used as the delimiter when joining the array elements. |
| null_replacement : str, optional |
| The string to replace null values within the array. If not set, null values are ignored. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column of string type, where each value is the result of joining the corresponding |
| array from the input column. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of array_join function. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", "b"],)], ['data']) |
| >>> df.select(sf.array_join(df.data, ",")).show() |
| +-------------------+ |
| |array_join(data, ,)| |
| +-------------------+ |
| | a,b,c| |
| | a,b| |
| +-------------------+ |
| |
| Example 2: Usage of array_join function with null_replacement argument. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", None, "c"],)], ['data']) |
| >>> df.select(sf.array_join(df.data, ",", "NULL")).show() |
| +-------------------------+ |
| |array_join(data, ,, NULL)| |
| +-------------------------+ |
| | a,NULL,c| |
| +-------------------------+ |
| |
| Example 3: Usage of array_join function without null_replacement argument. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", None, "c"],)], ['data']) |
| >>> df.select(sf.array_join(df.data, ",")).show() |
| +-------------------+ |
| |array_join(data, ,)| |
| +-------------------+ |
| | a,c| |
| +-------------------+ |
| |
| Example 4: Usage of array_join function with an array that is null. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, ArrayType, StringType |
| >>> schema = StructType([StructField("data", ArrayType(StringType()), True)]) |
| >>> df = spark.createDataFrame([(None,)], schema) |
| >>> df.select(sf.array_join(df.data, ",")).show() |
| +-------------------+ |
| |array_join(data, ,)| |
| +-------------------+ |
| | NULL| |
| +-------------------+ |
| |
| Example 5: Usage of array_join function with an array containing only null values. |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, ArrayType, StringType |
| >>> schema = StructType([StructField("data", ArrayType(StringType()), True)]) |
| >>> df = spark.createDataFrame([([None, None],)], schema) |
| >>> df.select(sf.array_join(df.data, ",", "NULL")).show() |
| +-------------------------+ |
| |array_join(data, ,, NULL)| |
| +-------------------------+ |
| | NULL,NULL| |
| +-------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| _get_active_spark_context() |
| if null_replacement is None: |
| return _invoke_function("array_join", _to_java_column(col), _enum_to_value(delimiter)) |
| else: |
| return _invoke_function( |
| "array_join", |
| _to_java_column(col), |
| _enum_to_value(delimiter), |
| _enum_to_value(null_replacement), |
| ) |
| |
| |
| @_try_remote_functions |
| def concat(*cols: "ColumnOrName") -> Column: |
| """ |
| Collection function: Concatenates multiple input columns together into a single column. |
    The function works with string, numeric, binary, and compatible array columns.
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or str |
| target column or columns to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        concatenated values. The type of the returned `Column` depends on the
        input columns' types.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.concat_ws` |
| :meth:`pyspark.sql.functions.array_join` : to concatenate string columns with delimiter |
| |
| Examples |
| -------- |
| Example 1: Concatenating string columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) |
| >>> df.select(sf.concat(df.s, df.d)).show() |
| +------------+ |
| |concat(s, d)| |
| +------------+ |
| | abcd123| |
| +------------+ |
| |
| Example 2: Concatenating array columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c']) |
| >>> df.select(sf.concat(df.a, df.b, df.c)).show() |
| +---------------+ |
| |concat(a, b, c)| |
| +---------------+ |
| |[1, 2, 3, 4, 5]| |
| | NULL| |
| +---------------+ |
| |
| Example 3: Concatenating numeric columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c']) |
| >>> df.select(sf.concat(df.a, df.b, df.c)).show() |
| +---------------+ |
| |concat(a, b, c)| |
| +---------------+ |
| | 123| |
| +---------------+ |
| |
| Example 4: Concatenating binary columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(bytearray(b'abc'), bytearray(b'def'))], ['a', 'b']) |
| >>> df.select(sf.concat(df.a, df.b)).show() |
| +-------------------+ |
| | concat(a, b)| |
| +-------------------+ |
| |[61 62 63 64 65 66]| |
| +-------------------+ |
| |
| Example 5: Concatenating mixed types of columns |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,"abc",3,"def")], ['a','b','c','d']) |
| >>> df.select(sf.concat(df.a, df.b, df.c, df.d)).show() |
| +------------------+ |
| |concat(a, b, c, d)| |
| +------------------+ |
| | 1abc3def| |
| +------------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("concat", cols) |
| |
| |
| @_try_remote_functions |
| def array_position(col: "ColumnOrName", value: Any) -> Column: |
| """ |
| Array function: Locates the position of the first occurrence of the given value |
| in the given array. Returns null if either of the arguments are null. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Notes |
| ----- |
    The position is 1-based, not 0-based. Returns 0 if the given
    value could not be found in the array.
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| target column to work on. |
| value : Any |
| value or a :class:`~pyspark.sql.Column` expression to look for. |
| |
| .. versionchanged:: 4.0.0 |
| `value` now also accepts a Column type. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| position of the value in the given array if found and 0 otherwise. |
| |
| Examples |
| -------- |
| Example 1: Finding the position of a string in an array of strings |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["c", "b", "a"],)], ['data']) |
| >>> df.select(sf.array_position(df.data, "a")).show() |
| +-----------------------+ |
| |array_position(data, a)| |
| +-----------------------+ |
| | 3| |
| +-----------------------+ |
| |
| Example 2: Finding the position of a string in an empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType |
| >>> schema = StructType([StructField("data", ArrayType(StringType()), True)]) |
| >>> df = spark.createDataFrame([([],)], schema=schema) |
| >>> df.select(sf.array_position(df.data, "a")).show() |
| +-----------------------+ |
| |array_position(data, a)| |
| +-----------------------+ |
| | 0| |
| +-----------------------+ |
| |
| Example 3: Finding the position of an integer in an array of integers |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) |
| >>> df.select(sf.array_position(df.data, 2)).show() |
| +-----------------------+ |
| |array_position(data, 2)| |
| +-----------------------+ |
| | 2| |
| +-----------------------+ |
| |
| Example 4: Finding the position of a non-existing value in an array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["c", "b", "a"],)], ['data']) |
| >>> df.select(sf.array_position(df.data, "d")).show() |
| +-----------------------+ |
| |array_position(data, d)| |
| +-----------------------+ |
| | 0| |
| +-----------------------+ |
| |
| Example 5: Finding the position of a value in an array with nulls |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([None, "b", "a"],)], ['data']) |
| >>> df.select(sf.array_position(df.data, "a")).show() |
| +-----------------------+ |
| |array_position(data, a)| |
| +-----------------------+ |
| | 3| |
| +-----------------------+ |
| |
| Example 6: Finding the position of a column's value in an array of integers |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([10, 20, 30], 20)], ['data', 'col']) |
| >>> df.select(sf.array_position(df.data, df.col)).show() |
| +-------------------------+ |
| |array_position(data, col)| |
| +-------------------------+ |
| | 2| |
| +-------------------------+ |
| |
| """ |
| return _invoke_function_over_columns("array_position", col, lit(value)) |
| |
| |
| @_try_remote_functions |
| def element_at(col: "ColumnOrName", extraction: Any) -> Column: |
| """ |
| Collection function: |
    (array, index) - Returns the element of the array at the given (1-based) index. If the
    index is 0, Spark throws an error. If the index is negative, accesses elements from the
    last to the first.
| If 'spark.sql.ansi.enabled' is set to true, an exception will be thrown if the index is out |
| of array boundaries instead of returning NULL. |
| |
| (map, key) - Returns value for given key in `extraction` if col is map. The function always |
| returns NULL if the key is not contained in the map. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column containing array or map |
    extraction : Any
        index to check for in array or key to check for in map
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| value at given position. |
| |
| Notes |
| ----- |
    The position is 1-based, not 0-based.
| If extraction is a string, :meth:`element_at` treats it as a literal string, |
| while :meth:`try_element_at` treats it as a column name. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.get` |
| :meth:`pyspark.sql.functions.try_element_at` |
| |
| Examples |
| -------- |
| Example 1: Getting the first element of an array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) |
| >>> df.select(sf.element_at(df.data, 1)).show() |
| +-------------------+ |
| |element_at(data, 1)| |
| +-------------------+ |
| | a| |
| +-------------------+ |
| |
| Example 2: Getting the last element of an array using negative index |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) |
| >>> df.select(sf.element_at(df.data, -1)).show() |
| +--------------------+ |
| |element_at(data, -1)| |
| +--------------------+ |
| | c| |
| +--------------------+ |
| |
| Example 3: Getting a value from a map using a key |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) |
| >>> df.select(sf.element_at(df.data, sf.lit("a"))).show() |
| +-------------------+ |
| |element_at(data, a)| |
| +-------------------+ |
| | 1.0| |
| +-------------------+ |
| |
| Example 4: Getting a non-existing value from a map using a key |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) |
| >>> df.select(sf.element_at(df.data, sf.lit("c"))).show() |
| +-------------------+ |
| |element_at(data, c)| |
| +-------------------+ |
| | NULL| |
| +-------------------+ |
| |
| Example 5: Getting a value from a map using a literal string as the key |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0}, "a")], ['data', 'b']) |
| >>> df.select(sf.element_at(df.data, 'b')).show() |
| +-------------------+ |
| |element_at(data, b)| |
| +-------------------+ |
| | 2.0| |
| +-------------------+ |
| """ |
| return _invoke_function_over_columns("element_at", col, lit(extraction)) |
| |
| |
| @_try_remote_functions |
| def try_element_at(col: "ColumnOrName", extraction: "ColumnOrName") -> Column: |
| """ |
| Collection function: |
    (array, index) - Returns the element of the array at the given (1-based) index. If the
    index is 0, Spark throws an error. If the index is negative, accesses elements from the
    last to the first. The function always returns NULL if the index exceeds the length
    of the array.
| |
| (map, key) - Returns value for given key. The function always returns NULL if the key is not |
| contained in the map. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column containing array or map |
    extraction : :class:`~pyspark.sql.Column` or str
        index to check for in array or key to check for in map
| |
| Notes |
| ----- |
    The position is 1-based, not 0-based.
| If extraction is a string, :meth:`try_element_at` treats it as a column name, |
| while :meth:`element_at` treats it as a literal string. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.get` |
| :meth:`pyspark.sql.functions.element_at` |
| |
| Examples |
| -------- |
| Example 1: Getting the first element of an array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) |
| >>> df.select(sf.try_element_at(df.data, sf.lit(1))).show() |
| +-----------------------+ |
| |try_element_at(data, 1)| |
| +-----------------------+ |
| | a| |
| +-----------------------+ |
| |
| Example 2: Getting the last element of an array using negative index |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) |
| >>> df.select(sf.try_element_at(df.data, sf.lit(-1))).show() |
| +------------------------+ |
| |try_element_at(data, -1)| |
| +------------------------+ |
| | c| |
| +------------------------+ |
| |
| Example 3: Getting a value from a map using a key |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) |
| >>> df.select(sf.try_element_at(df.data, sf.lit("a"))).show() |
| +-----------------------+ |
| |try_element_at(data, a)| |
| +-----------------------+ |
| | 1.0| |
| +-----------------------+ |
| |
| Example 4: Getting a non-existing element from an array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) |
| >>> df.select(sf.try_element_at(df.data, sf.lit(4))).show() |
| +-----------------------+ |
| |try_element_at(data, 4)| |
| +-----------------------+ |
| | NULL| |
| +-----------------------+ |
| |
| Example 5: Getting a non-existing value from a map using a key |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) |
| >>> df.select(sf.try_element_at(df.data, sf.lit("c"))).show() |
| +-----------------------+ |
| |try_element_at(data, c)| |
| +-----------------------+ |
| | NULL| |
| +-----------------------+ |
| |
| Example 6: Getting a value from a map using a column name as the key |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0}, "a")], ['data', 'b']) |
| >>> df.select(sf.try_element_at(df.data, 'b')).show() |
| +-----------------------+ |
| |try_element_at(data, b)| |
| +-----------------------+ |
| | 1.0| |
| +-----------------------+ |
| """ |
| return _invoke_function_over_columns("try_element_at", col, extraction) |
| |
| |
| @_try_remote_functions |
| def get(col: "ColumnOrName", index: Union["ColumnOrName", int]) -> Column: |
| """ |
| Array function: Returns the element of an array at the given (0-based) index. |
| If the index points outside of the array boundaries, then this function |
| returns NULL. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Name of the column containing the array. |
| index : :class:`~pyspark.sql.Column` or str or int |
| Index to check for in the array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Value at the given position. |
| |
| Notes |
| ----- |
    The index is 0-based, not 1-based.
| Supports Spark Connect. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.element_at` |
| |
| Examples |
| -------- |
| Example 1: Getting an element at a fixed position |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) |
| >>> df.select(sf.get(df.data, 1)).show() |
| +------------+ |
| |get(data, 1)| |
| +------------+ |
| | b| |
| +------------+ |
| |
| Example 2: Getting an element at a position outside the array boundaries |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) |
| >>> df.select(sf.get(df.data, 3)).show() |
| +------------+ |
| |get(data, 3)| |
| +------------+ |
| | NULL| |
| +------------+ |
| |
| Example 3: Getting an element at a position specified by another column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"], 2)], ['data', 'index']) |
| >>> df.select(sf.get(df.data, df.index)).show() |
| +----------------+ |
| |get(data, index)| |
| +----------------+ |
| | c| |
| +----------------+ |
| |
| |
| Example 4: Getting an element at a position calculated from another column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"], 2)], ['data', 'index']) |
| >>> df.select(sf.get(df.data, df.index - 1)).show() |
| +----------------------+ |
| |get(data, (index - 1))| |
| +----------------------+ |
| | b| |
| +----------------------+ |
| |
| Example 5: Getting an element at a negative position |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(["a", "b", "c"], )], ['data']) |
| >>> df.select(sf.get(df.data, -1)).show() |
| +-------------+ |
| |get(data, -1)| |
| +-------------+ |
| | NULL| |
| +-------------+ |
| """ |
| index = _enum_to_value(index) |
| index = lit(index) if isinstance(index, int) else index |
| |
| return _invoke_function_over_columns("get", col, index) |
| |
| |
| @_try_remote_functions |
| def array_prepend(col: "ColumnOrName", value: Any) -> Column: |
| """ |
| Array function: Returns an array containing the given element as |
| the first element and the rest of the elements from the original array. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column containing array |
| value : |
| a literal value, or a :class:`~pyspark.sql.Column` expression. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| an array with the given value prepended. |
| |
| Examples |
| -------- |
| Example 1: Prepending a column value to an array column |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2="c")]) |
| >>> df.select(sf.array_prepend(df.c1, df.c2)).show() |
| +---------------------+ |
| |array_prepend(c1, c2)| |
| +---------------------+ |
| | [c, b, a, c]| |
| +---------------------+ |
| |
| Example 2: Prepending a numeric value to an array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) |
| >>> df.select(sf.array_prepend(df.data, 4)).show() |
| +----------------------+ |
| |array_prepend(data, 4)| |
| +----------------------+ |
| | [4, 1, 2, 3]| |
| +----------------------+ |
| |
| Example 3: Prepending a null value to an array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) |
| >>> df.select(sf.array_prepend(df.data, None)).show() |
| +-------------------------+ |
| |array_prepend(data, NULL)| |
| +-------------------------+ |
| | [NULL, 1, 2, 3]| |
| +-------------------------+ |
| |
| Example 4: Prepending a value to a NULL array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([(None,)], schema=schema) |
| >>> df.select(sf.array_prepend(df.data, 4)).show() |
| +----------------------+ |
| |array_prepend(data, 4)| |
| +----------------------+ |
| | NULL| |
| +----------------------+ |
| |
| Example 5: Prepending a value to an empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema=schema) |
| >>> df.select(sf.array_prepend(df.data, 1)).show() |
| +----------------------+ |
| |array_prepend(data, 1)| |
| +----------------------+ |
| | [1]| |
| +----------------------+ |
| """ |
| return _invoke_function_over_columns("array_prepend", col, lit(value)) |
| |
| |
| @_try_remote_functions |
| def array_remove(col: "ColumnOrName", element: Any) -> Column: |
| """ |
    Array function: Removes all elements equal to the given element from the array.
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column containing array |
| element : |
| element or a :class:`~pyspark.sql.Column` expression to be removed from the array |
| |
| .. versionchanged:: 4.0.0 |
| `element` now also accepts a Column type. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that is an array excluding the given value from the input column. |
| |
| Examples |
| -------- |
| Example 1: Removing a specific value from a simple array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3, 1, 1],)], ['data']) |
| >>> df.select(sf.array_remove(df.data, 1)).show() |
| +---------------------+ |
| |array_remove(data, 1)| |
| +---------------------+ |
| | [2, 3]| |
| +---------------------+ |
| |
| Example 2: Removing a specific value from multiple arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([4, 5, 5, 4],)], ['data']) |
| >>> df.select(sf.array_remove(df.data, 5)).show() |
| +---------------------+ |
| |array_remove(data, 5)| |
| +---------------------+ |
| | [1, 2, 3, 1, 1]| |
| | [4, 4]| |
| +---------------------+ |
| |
| Example 3: Removing a value that does not exist in the array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) |
| >>> df.select(sf.array_remove(df.data, 4)).show() |
| +---------------------+ |
| |array_remove(data, 4)| |
| +---------------------+ |
| | [1, 2, 3]| |
| +---------------------+ |
| |
| Example 4: Removing a value from an array with all identical values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 1, 1],)], ['data']) |
| >>> df.select(sf.array_remove(df.data, 1)).show() |
| +---------------------+ |
| |array_remove(data, 1)| |
| +---------------------+ |
| | []| |
| +---------------------+ |
| |
| Example 5: Removing a value from an empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema) |
| >>> df.select(sf.array_remove(df.data, 1)).show() |
| +---------------------+ |
| |array_remove(data, 1)| |
| +---------------------+ |
| | []| |
| +---------------------+ |
| |
| Example 6: Removing a column's value from a simple array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3, 1, 1], 1)], ['data', 'col']) |
| >>> df.select(sf.array_remove(df.data, df.col)).show() |
| +-----------------------+ |
| |array_remove(data, col)| |
| +-----------------------+ |
| | [2, 3]| |
| +-----------------------+ |
| """ |
| return _invoke_function_over_columns("array_remove", col, lit(element)) |
| |
| |
| @_try_remote_functions |
| def array_distinct(col: "ColumnOrName") -> Column: |
| """ |
| Array function: removes duplicate values from the array. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that is an array of unique values from the input column. |
| |
| Examples |
| -------- |
| Example 1: Removing duplicate values from a simple array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3, 2],)], ['data']) |
| >>> df.select(sf.array_distinct(df.data)).show() |
| +--------------------+ |
| |array_distinct(data)| |
| +--------------------+ |
| | [1, 2, 3]| |
| +--------------------+ |
| |
| Example 2: Removing duplicate values from multiple arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], ['data']) |
| >>> df.select(sf.array_distinct(df.data)).show() |
| +--------------------+ |
| |array_distinct(data)| |
| +--------------------+ |
| | [1, 2, 3]| |
| | [4, 5]| |
| +--------------------+ |
| |
| Example 3: Removing duplicate values from an array with all identical values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 1, 1],)], ['data']) |
| >>> df.select(sf.array_distinct(df.data)).show() |
| +--------------------+ |
| |array_distinct(data)| |
| +--------------------+ |
| | [1]| |
| +--------------------+ |
| |
| Example 4: Removing duplicate values from an array with no duplicate values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) |
| >>> df.select(sf.array_distinct(df.data)).show() |
| +--------------------+ |
| |array_distinct(data)| |
| +--------------------+ |
| | [1, 2, 3]| |
| +--------------------+ |
| |
| Example 5: Removing duplicate values from an empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema) |
| >>> df.select(sf.array_distinct(df.data)).show() |
| +--------------------+ |
| |array_distinct(data)| |
| +--------------------+ |
| | []| |
| +--------------------+ |
| """ |
| return _invoke_function_over_columns("array_distinct", col) |
| |
| |
| @_try_remote_functions |
| def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: Any) -> Column: |
| """ |
    Array function: Inserts an item into a given array at a specified array index.
    Array indices start at 1, or count from the end of the array if the index is negative.
    An index beyond the array size pads the array with 'null' elements at the end
    (or at the front, if the index is negative) before inserting the item.
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| arr : :class:`~pyspark.sql.Column` or str |
| name of column containing an array |
| pos : :class:`~pyspark.sql.Column` or str or int |
        name of Numeric type column indicating the position of insertion
        (starting at index 1; a negative position counts from the end of the array)
| value : |
| a literal value, or a :class:`~pyspark.sql.Column` expression. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| an array of values, including the new specified value |
| |
| Notes |
| ----- |
| Supports Spark Connect. |
| |
| Examples |
| -------- |
| Example 1: Inserting a value at a specific position |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) |
| >>> df.select(sf.array_insert(df.data, 2, 'd')).show() |
| +------------------------+ |
| |array_insert(data, 2, d)| |
| +------------------------+ |
| | [a, d, b, c]| |
| +------------------------+ |
| |
| Example 2: Inserting a value at a negative position |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) |
| >>> df.select(sf.array_insert(df.data, -2, 'd')).show() |
| +-------------------------+ |
| |array_insert(data, -2, d)| |
| +-------------------------+ |
| | [a, b, d, c]| |
| +-------------------------+ |
| |
| Example 3: Inserting a value at a position greater than the array size |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) |
| >>> df.select(sf.array_insert(df.data, 5, 'e')).show() |
| +------------------------+ |
| |array_insert(data, 5, e)| |
| +------------------------+ |
| | [a, b, c, NULL, e]| |
| +------------------------+ |
| |
| Example 4: Inserting a NULL value |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) |
| >>> df.select(sf.array_insert(df.data, 2, sf.lit(None))).show() |
| +---------------------------+ |
| |array_insert(data, 2, NULL)| |
| +---------------------------+ |
| | [a, NULL, b, c]| |
| +---------------------------+ |
| |
| Example 5: Inserting a value into a NULL array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([StructField("data", ArrayType(IntegerType()), True)]) |
| >>> df = spark.createDataFrame([(None,)], schema=schema) |
| >>> df.select(sf.array_insert(df.data, 1, 5)).show() |
| +------------------------+ |
| |array_insert(data, 1, 5)| |
| +------------------------+ |
| | NULL| |
| +------------------------+ |
| """ |
| pos = _enum_to_value(pos) |
| pos = lit(pos) if isinstance(pos, int) else pos |
| |
| return _invoke_function_over_columns("array_insert", arr, pos, lit(value)) |
| |
| |
| @_try_remote_functions |
| def array_intersect(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
| Array function: returns a new array containing the intersection of elements in col1 and col2, |
| without duplicates. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or str |
| Name of column containing the first array. |
| col2 : :class:`~pyspark.sql.Column` or str |
| Name of column containing the second array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new array containing the intersection of elements in col1 and col2. |
| |
| Notes |
| ----- |
| This function does not preserve the order of the elements in the input arrays. |
| |
| Examples |
| -------- |
| Example 1: Basic usage |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) |
| >>> df.select(sf.sort_array(sf.array_intersect(df.c1, df.c2))).show() |
| +-----------------------------------------+ |
| |sort_array(array_intersect(c1, c2), true)| |
| +-----------------------------------------+ |
| | [a, c]| |
| +-----------------------------------------+ |
| |
| Example 2: Intersection with no common elements |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["d", "e", "f"])]) |
| >>> df.select(sf.array_intersect(df.c1, df.c2)).show() |
| +-----------------------+ |
| |array_intersect(c1, c2)| |
| +-----------------------+ |
| | []| |
| +-----------------------+ |
| |
| Example 3: Intersection with all common elements |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["a", "b", "c"], c2=["a", "b", "c"])]) |
| >>> df.select(sf.sort_array(sf.array_intersect(df.c1, df.c2))).show() |
| +-----------------------------------------+ |
| |sort_array(array_intersect(c1, c2), true)| |
| +-----------------------------------------+ |
| | [a, b, c]| |
| +-----------------------------------------+ |
| |
| Example 4: Intersection with null values |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["a", "b", None], c2=["a", None, "c"])]) |
| >>> df.select(sf.sort_array(sf.array_intersect(df.c1, df.c2))).show() |
| +-----------------------------------------+ |
| |sort_array(array_intersect(c1, c2), true)| |
| +-----------------------------------------+ |
| | [NULL, a]| |
| +-----------------------------------------+ |
| |
| Example 5: Intersection with empty arrays |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType |
| >>> data = [Row(c1=[], c2=["a", "b", "c"])] |
| >>> schema = StructType([ |
| ... StructField("c1", ArrayType(StringType()), True), |
| ... StructField("c2", ArrayType(StringType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame(data, schema) |
| >>> df.select(sf.array_intersect(df.c1, df.c2)).show() |
| +-----------------------+ |
| |array_intersect(c1, c2)| |
| +-----------------------+ |
| | []| |
| +-----------------------+ |
| """ |
| return _invoke_function_over_columns("array_intersect", col1, col2) |
| |
| |
| @_try_remote_functions |
| def array_union(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
| Array function: returns a new array containing the union of elements in col1 and col2, |
| without duplicates. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or str |
| Name of column containing the first array. |
| col2 : :class:`~pyspark.sql.Column` or str |
| Name of column containing the second array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new array containing the union of elements in col1 and col2. |
| |
| Notes |
| ----- |
| This function does not preserve the order of the elements in the input arrays. |
| |
| Examples |
| -------- |
| Example 1: Basic usage |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) |
| >>> df.select(sf.sort_array(sf.array_union(df.c1, df.c2))).show() |
| +-------------------------------------+ |
| |sort_array(array_union(c1, c2), true)| |
| +-------------------------------------+ |
| | [a, b, c, d, f]| |
| +-------------------------------------+ |
| |
| Example 2: Union with no common elements |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["d", "e", "f"])]) |
| >>> df.select(sf.sort_array(sf.array_union(df.c1, df.c2))).show() |
| +-------------------------------------+ |
| |sort_array(array_union(c1, c2), true)| |
| +-------------------------------------+ |
| | [a, b, c, d, e, f]| |
| +-------------------------------------+ |
| |
| Example 3: Union with all common elements |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["a", "b", "c"], c2=["a", "b", "c"])]) |
| >>> df.select(sf.sort_array(sf.array_union(df.c1, df.c2))).show() |
| +-------------------------------------+ |
| |sort_array(array_union(c1, c2), true)| |
| +-------------------------------------+ |
| | [a, b, c]| |
| +-------------------------------------+ |
| |
| Example 4: Union with null values |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["a", "b", None], c2=["a", None, "c"])]) |
| >>> df.select(sf.sort_array(sf.array_union(df.c1, df.c2))).show() |
| +-------------------------------------+ |
| |sort_array(array_union(c1, c2), true)| |
| +-------------------------------------+ |
| | [NULL, a, b, c]| |
| +-------------------------------------+ |
| |
| Example 5: Union with empty arrays |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType |
| >>> data = [Row(c1=[], c2=["a", "b", "c"])] |
| >>> schema = StructType([ |
| ... StructField("c1", ArrayType(StringType()), True), |
| ... StructField("c2", ArrayType(StringType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame(data, schema) |
| >>> df.select(sf.sort_array(sf.array_union(df.c1, df.c2))).show() |
| +-------------------------------------+ |
| |sort_array(array_union(c1, c2), true)| |
| +-------------------------------------+ |
| | [a, b, c]| |
| +-------------------------------------+ |
| """ |
| return _invoke_function_over_columns("array_union", col1, col2) |
| |
| |
| @_try_remote_functions |
| def array_except(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
| Array function: returns a new array containing the elements present in col1 but not in col2, |
| without duplicates. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or str |
| Name of column containing the first array. |
| col2 : :class:`~pyspark.sql.Column` or str |
| Name of column containing the second array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new array containing the elements present in col1 but not in col2. |
| |
| Notes |
| ----- |
| This function does not preserve the order of the elements in the input arrays. |
| |
| Examples |
| -------- |
| Example 1: Basic usage |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])]) |
| >>> df.select(sf.array_except(df.c1, df.c2)).show() |
| +--------------------+ |
| |array_except(c1, c2)| |
| +--------------------+ |
| | [b]| |
| +--------------------+ |
| |
| Example 2: Except with no common elements |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["d", "e", "f"])]) |
| >>> df.select(sf.sort_array(sf.array_except(df.c1, df.c2))).show() |
| +--------------------------------------+ |
| |sort_array(array_except(c1, c2), true)| |
| +--------------------------------------+ |
| | [a, b, c]| |
| +--------------------------------------+ |
| |
| Example 3: Except with all common elements |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["a", "b", "c"], c2=["a", "b", "c"])]) |
| >>> df.select(sf.array_except(df.c1, df.c2)).show() |
| +--------------------+ |
| |array_except(c1, c2)| |
| +--------------------+ |
| | []| |
| +--------------------+ |
| |
| Example 4: Except with null values |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["a", "b", None], c2=["a", None, "c"])]) |
| >>> df.select(sf.array_except(df.c1, df.c2)).show() |
| +--------------------+ |
| |array_except(c1, c2)| |
| +--------------------+ |
| | [b]| |
| +--------------------+ |
| |
| Example 5: Except with empty arrays |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType |
| >>> data = [Row(c1=[], c2=["a", "b", "c"])] |
| >>> schema = StructType([ |
| ... StructField("c1", ArrayType(StringType()), True), |
| ... StructField("c2", ArrayType(StringType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame(data, schema) |
| >>> df.select(sf.array_except(df.c1, df.c2)).show() |
| +--------------------+ |
| |array_except(c1, c2)| |
| +--------------------+ |
| | []| |
| +--------------------+ |
| """ |
| return _invoke_function_over_columns("array_except", col1, col2) |
| |
| |
| @_try_remote_functions |
| def array_compact(col: "ColumnOrName") -> Column: |
| """ |
| Array function: removes null values from the array. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that is an array excluding the null values from the input column. |
| |
| Notes |
| ----- |
| Supports Spark Connect. |
| |
| Examples |
| -------- |
| Example 1: Removing null values from a simple array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, None, 2, 3],)], ['data']) |
| >>> df.select(sf.array_compact(df.data)).show() |
| +-------------------+ |
| |array_compact(data)| |
| +-------------------+ |
| | [1, 2, 3]| |
| +-------------------+ |
| |
| Example 2: Removing null values from multiple arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, None, 2, 3],), ([4, 5, None, 4],)], ['data']) |
| >>> df.select(sf.array_compact(df.data)).show() |
| +-------------------+ |
| |array_compact(data)| |
| +-------------------+ |
| | [1, 2, 3]| |
| | [4, 5, 4]| |
| +-------------------+ |
| |
| Example 3: Removing null values from an array with all null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(StringType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([None, None, None],)], schema) |
| >>> df.select(sf.array_compact(df.data)).show() |
| +-------------------+ |
| |array_compact(data)| |
| +-------------------+ |
| | []| |
| +-------------------+ |
| |
| Example 4: Removing null values from an array with no null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) |
| >>> df.select(sf.array_compact(df.data)).show() |
| +-------------------+ |
| |array_compact(data)| |
| +-------------------+ |
| | [1, 2, 3]| |
| +-------------------+ |
| |
| Example 5: Removing null values from an empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(StringType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema) |
| >>> df.select(sf.array_compact(df.data)).show() |
| +-------------------+ |
| |array_compact(data)| |
| +-------------------+ |
| | []| |
| +-------------------+ |
| """ |
| return _invoke_function_over_columns("array_compact", col) |
| |
| |
| @_try_remote_functions |
| def array_append(col: "ColumnOrName", value: Any) -> Column: |
| """ |
| Array function: returns a new array column by appending `value` to the existing array `col`. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column containing the array. |
| value : |
| A literal value, or a :class:`~pyspark.sql.Column` expression to be appended to the array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new array column with `value` appended to the original array. |
| |
| Notes |
| ----- |
| Supports Spark Connect. |
| |
| Examples |
| -------- |
| Example 1: Appending a column value to an array column |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2="c")]) |
| >>> df.select(sf.array_append(df.c1, df.c2)).show() |
| +--------------------+ |
| |array_append(c1, c2)| |
| +--------------------+ |
| | [b, a, c, c]| |
| +--------------------+ |
| |
| Example 2: Appending a numeric value to an array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) |
| >>> df.select(sf.array_append(df.data, 4)).show() |
| +---------------------+ |
| |array_append(data, 4)| |
| +---------------------+ |
| | [1, 2, 3, 4]| |
| +---------------------+ |
| |
| Example 3: Appending a null value to an array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) |
| >>> df.select(sf.array_append(df.data, None)).show() |
| +------------------------+ |
| |array_append(data, NULL)| |
| +------------------------+ |
| | [1, 2, 3, NULL]| |
| +------------------------+ |
| |
| Example 4: Appending a value to a NULL array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([(None,)], schema=schema) |
| >>> df.select(sf.array_append(df.data, 4)).show() |
| +---------------------+ |
| |array_append(data, 4)| |
| +---------------------+ |
| | NULL| |
| +---------------------+ |
| |
| Example 5: Appending a value to an empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema=schema) |
| >>> df.select(sf.array_append(df.data, 1)).show() |
| +---------------------+ |
| |array_append(data, 1)| |
| +---------------------+ |
| | [1]| |
| +---------------------+ |
| """ |
| return _invoke_function_over_columns("array_append", col, lit(value)) |
| |
| |
| @_try_remote_functions |
| def explode(col: "ColumnOrName") -> Column: |
| """ |
| Returns a new row for each element in the given array or map. |
| Uses the default column name `col` for elements in the array and |
| `key` and `value` for elements in the map unless specified otherwise. |
| |
| .. versionadded:: 1.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| Target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| One row per array item or map key value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.posexplode` |
| :meth:`pyspark.sql.functions.explode_outer` |
| :meth:`pyspark.sql.functions.posexplode_outer` |
| :meth:`pyspark.sql.functions.inline` |
| :meth:`pyspark.sql.functions.inline_outer` |
| |
| Notes |
| ----- |
| Only one explode is allowed per SELECT clause. |
| |
| Examples |
| -------- |
| Example 1: Exploding an array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(1,2,3,NULL)), (2,ARRAY()), (3,NULL) AS t(i,a)') |
| >>> df.show() |
| +---+---------------+ |
| | i| a| |
| +---+---------------+ |
| | 1|[1, 2, 3, NULL]| |
| | 2| []| |
| | 3| NULL| |
| +---+---------------+ |
| |
| >>> df.select('*', sf.explode('a')).show() |
| +---+---------------+----+ |
| | i| a| col| |
| +---+---------------+----+ |
| | 1|[1, 2, 3, NULL]| 1| |
| | 1|[1, 2, 3, NULL]| 2| |
| | 1|[1, 2, 3, NULL]| 3| |
| | 1|[1, 2, 3, NULL]|NULL| |
| +---+---------------+----+ |
| |
| Example 2: Exploding a map column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,MAP(1,2,3,4,5,NULL)), (2,MAP()), (3,NULL) AS t(i,m)') |
| >>> df.show(truncate=False) |
| +---+---------------------------+ |
| |i |m | |
| +---+---------------------------+ |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}| |
| |2 |{} | |
| |3 |NULL | |
| +---+---------------------------+ |
| |
| >>> df.select('*', sf.explode('m')).show(truncate=False) |
| +---+---------------------------+---+-----+ |
| |i |m |key|value| |
| +---+---------------------------+---+-----+ |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|1 |2 | |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|3 |4 | |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|5 |NULL | |
| +---+---------------------------+---+-----+ |
| |
| Example 3: Exploding multiple array columns |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql('SELECT ARRAY(1,2) AS a1, ARRAY(3,4,5) AS a2') |
| >>> df.select( |
| ... '*', sf.explode('a1').alias('v1') |
| ... ).select('*', sf.explode('a2').alias('v2')).show() |
| +------+---------+---+---+ |
| | a1| a2| v1| v2| |
| +------+---------+---+---+ |
| |[1, 2]|[3, 4, 5]| 1| 3| |
| |[1, 2]|[3, 4, 5]| 1| 4| |
| |[1, 2]|[3, 4, 5]| 1| 5| |
| |[1, 2]|[3, 4, 5]| 2| 3| |
| |[1, 2]|[3, 4, 5]| 2| 4| |
| |[1, 2]|[3, 4, 5]| 2| 5| |
| +------+---------+---+---+ |
| |
| Example 4: Exploding an array of struct column |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a') |
| >>> df.select(sf.explode('a').alias("s")).select("s.*").show() |
| +---+---+ |
| | a| b| |
| +---+---+ |
| | 1| 2| |
| | 3| 4| |
| +---+---+ |
| """ # noqa: E501 |
| return _invoke_function_over_columns("explode", col) |
| |
| |
| @_try_remote_functions |
| def posexplode(col: "ColumnOrName") -> Column: |
| """ |
| Returns a new row for each element with position in the given array or map. |
| Uses the default column name `pos` for position, and `col` for elements in the |
| array and `key` and `value` for elements in the map unless specified otherwise. |
| |
| .. versionadded:: 2.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| one row per array item or map key value including positions as a separate column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.explode` |
| :meth:`pyspark.sql.functions.explode_outer` |
| :meth:`pyspark.sql.functions.posexplode_outer` |
| :meth:`pyspark.sql.functions.inline` |
| :meth:`pyspark.sql.functions.inline_outer` |
| |
| Examples |
| -------- |
| Example 1: Exploding an array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(1,2,3,NULL)), (2,ARRAY()), (3,NULL) AS t(i,a)') |
| >>> df.show() |
| +---+---------------+ |
| | i| a| |
| +---+---------------+ |
| | 1|[1, 2, 3, NULL]| |
| | 2| []| |
| | 3| NULL| |
| +---+---------------+ |
| |
| >>> df.select('*', sf.posexplode('a')).show() |
| +---+---------------+---+----+ |
| | i| a|pos| col| |
| +---+---------------+---+----+ |
| | 1|[1, 2, 3, NULL]| 0| 1| |
| | 1|[1, 2, 3, NULL]| 1| 2| |
| | 1|[1, 2, 3, NULL]| 2| 3| |
| | 1|[1, 2, 3, NULL]| 3|NULL| |
| +---+---------------+---+----+ |
| |
| Example 2: Exploding a map column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,MAP(1,2,3,4,5,NULL)), (2,MAP()), (3,NULL) AS t(i,m)') |
| >>> df.show(truncate=False) |
| +---+---------------------------+ |
| |i |m | |
| +---+---------------------------+ |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}| |
| |2 |{} | |
| |3 |NULL | |
| +---+---------------------------+ |
| |
| >>> df.select('*', sf.posexplode('m')).show(truncate=False) |
| +---+---------------------------+---+---+-----+ |
| |i |m |pos|key|value| |
| +---+---------------------------+---+---+-----+ |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|0 |1 |2 | |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|1 |3 |4 | |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|2 |5 |NULL | |
| +---+---------------------------+---+---+-----+ |
| """ # noqa: E501 |
| return _invoke_function_over_columns("posexplode", col) |
| |
| |
| @_try_remote_functions |
| def inline(col: "ColumnOrName") -> Column: |
| """ |
| Explodes an array of structs into a table. |
| |
| This function takes an input column containing an array of structs and returns a |
| new column where each struct in the array is exploded into a separate row. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| Input column of values to explode. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Generator expression with the inline exploded result. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.explode` |
| :meth:`pyspark.sql.functions.explode_outer` |
| :meth:`pyspark.sql.functions.posexplode` |
| :meth:`pyspark.sql.functions.posexplode_outer` |
| :meth:`pyspark.sql.functions.inline_outer` |
| |
| Examples |
| -------- |
| Example 1: Using inline with a single struct array column |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a') |
| >>> df.select('*', sf.inline(df.a)).show() |
| +----------------+---+---+ |
| | a| a| b| |
| +----------------+---+---+ |
| |[{1, 2}, {3, 4}]| 1| 2| |
| |[{1, 2}, {3, 4}]| 3| 4| |
| +----------------+---+---+ |
| |
| Example 2: Using inline with a column name |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a') |
| >>> df.select('*', sf.inline('a')).show() |
| +----------------+---+---+ |
| | a| a| b| |
| +----------------+---+---+ |
| |[{1, 2}, {3, 4}]| 1| 2| |
| |[{1, 2}, {3, 4}]| 3| 4| |
| +----------------+---+---+ |
| |
| Example 3: Using inline with an alias |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a') |
| >>> df.select('*', sf.inline('a').alias("c1", "c2")).show() |
| +----------------+---+---+ |
| | a| c1| c2| |
| +----------------+---+---+ |
| |[{1, 2}, {3, 4}]| 1| 2| |
| |[{1, 2}, {3, 4}]| 3| 4| |
| +----------------+---+---+ |
| |
| Example 4: Using inline with multiple struct array columns |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql('SELECT ARRAY(NAMED_STRUCT("a",1,"b",2), NAMED_STRUCT("a",3,"b",4)) AS a1, ARRAY(NAMED_STRUCT("c",5,"d",6), NAMED_STRUCT("c",7,"d",8)) AS a2') |
| >>> df.select( |
| ... '*', sf.inline('a1') |
| ... ).select('*', sf.inline('a2')).show() |
| +----------------+----------------+---+---+---+---+ |
| | a1| a2| a| b| c| d| |
| +----------------+----------------+---+---+---+---+ |
| |[{1, 2}, {3, 4}]|[{5, 6}, {7, 8}]| 1| 2| 5| 6| |
| |[{1, 2}, {3, 4}]|[{5, 6}, {7, 8}]| 1| 2| 7| 8| |
| |[{1, 2}, {3, 4}]|[{5, 6}, {7, 8}]| 3| 4| 5| 6| |
| |[{1, 2}, {3, 4}]|[{5, 6}, {7, 8}]| 3| 4| 7| 8| |
| +----------------+----------------+---+---+---+---+ |
| |
| Example 5: Using inline with a nested struct array column |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql('SELECT NAMED_STRUCT("a",1,"b",2,"c",ARRAY(NAMED_STRUCT("c",3,"d",4), NAMED_STRUCT("c",5,"d",6))) AS s') |
| >>> df.select('*', sf.inline('s.c')).show(truncate=False) |
| +------------------------+---+---+ |
| |s |c |d | |
| +------------------------+---+---+ |
| |{1, 2, [{3, 4}, {5, 6}]}|3 |4 | |
| |{1, 2, [{3, 4}, {5, 6}]}|5 |6 | |
| +------------------------+---+---+ |
| |
Example 6: Using inline with a column containing an array with a null element, an empty array, and a null array
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(NAMED_STRUCT("a",1,"b",2), NULL, NAMED_STRUCT("a",3,"b",4))), (2,ARRAY()), (3,NULL) AS t(i,s)') |
| >>> df.show(truncate=False) |
| +---+----------------------+ |
| |i |s | |
| +---+----------------------+ |
| |1 |[{1, 2}, NULL, {3, 4}]| |
| |2 |[] | |
| |3 |NULL | |
| +---+----------------------+ |
| |
| >>> df.select('*', sf.inline('s')).show(truncate=False) |
| +---+----------------------+----+----+ |
| |i |s |a |b | |
| +---+----------------------+----+----+ |
| |1 |[{1, 2}, NULL, {3, 4}]|1 |2 | |
| |1 |[{1, 2}, NULL, {3, 4}]|NULL|NULL| |
| |1 |[{1, 2}, NULL, {3, 4}]|3 |4 | |
| +---+----------------------+----+----+ |
| """ # noqa: E501 |
| return _invoke_function_over_columns("inline", col) |
| |
| |
| @_try_remote_functions |
| def explode_outer(col: "ColumnOrName") -> Column: |
| """ |
| Returns a new row for each element in the given array or map. |
| Unlike explode, if the array/map is null or empty then null is produced. |
| Uses the default column name `col` for elements in the array and |
| `key` and `value` for elements in the map unless specified otherwise. |
| |
| .. versionadded:: 2.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| one row per array item or map key value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.explode` |
| :meth:`pyspark.sql.functions.posexplode` |
| :meth:`pyspark.sql.functions.posexplode_outer` |
| :meth:`pyspark.sql.functions.inline` |
| :meth:`pyspark.sql.functions.inline_outer` |
| |
| Examples |
| -------- |
| Example 1: Using an array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(1,2,3,NULL)), (2,ARRAY()), (3,NULL) AS t(i,a)') |
| >>> df.select('*', sf.explode_outer('a')).show() |
| +---+---------------+----+ |
| | i| a| col| |
| +---+---------------+----+ |
| | 1|[1, 2, 3, NULL]| 1| |
| | 1|[1, 2, 3, NULL]| 2| |
| | 1|[1, 2, 3, NULL]| 3| |
| | 1|[1, 2, 3, NULL]|NULL| |
| | 2| []|NULL| |
| | 3| NULL|NULL| |
| +---+---------------+----+ |
| |
| Example 2: Using a map column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,MAP(1,2,3,4,5,NULL)), (2,MAP()), (3,NULL) AS t(i,m)') |
| >>> df.select('*', sf.explode_outer('m')).show(truncate=False) |
| +---+---------------------------+----+-----+ |
| |i |m |key |value| |
| +---+---------------------------+----+-----+ |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|1 |2 | |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|3 |4 | |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|5 |NULL | |
| |2 |{} |NULL|NULL | |
| |3 |NULL |NULL|NULL | |
| +---+---------------------------+----+-----+ |
| """ # noqa: E501 |
| return _invoke_function_over_columns("explode_outer", col) |
| |
| |
| @_try_remote_functions |
| def posexplode_outer(col: "ColumnOrName") -> Column: |
| """ |
| Returns a new row for each element with position in the given array or map. |
| Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced. |
| Uses the default column name `pos` for position, and `col` for elements in the |
| array and `key` and `value` for elements in the map unless specified otherwise. |
| |
| .. versionadded:: 2.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| target column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| one row per array item or map key value including positions as a separate column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.explode` |
| :meth:`pyspark.sql.functions.explode_outer` |
| :meth:`pyspark.sql.functions.posexplode` |
| :meth:`pyspark.sql.functions.inline` |
| :meth:`pyspark.sql.functions.inline_outer` |
| |
| Examples |
| -------- |
| Example 1: Using an array column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(1,2,3,NULL)), (2,ARRAY()), (3,NULL) AS t(i,a)') |
| >>> df.select('*', sf.posexplode_outer('a')).show() |
| +---+---------------+----+----+ |
| | i| a| pos| col| |
| +---+---------------+----+----+ |
| | 1|[1, 2, 3, NULL]| 0| 1| |
| | 1|[1, 2, 3, NULL]| 1| 2| |
| | 1|[1, 2, 3, NULL]| 2| 3| |
| | 1|[1, 2, 3, NULL]| 3|NULL| |
| | 2| []|NULL|NULL| |
| | 3| NULL|NULL|NULL| |
| +---+---------------+----+----+ |
| |
| Example 2: Using a map column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,MAP(1,2,3,4,5,NULL)), (2,MAP()), (3,NULL) AS t(i,m)') |
| >>> df.select('*', sf.posexplode_outer('m')).show(truncate=False) |
| +---+---------------------------+----+----+-----+ |
| |i |m |pos |key |value| |
| +---+---------------------------+----+----+-----+ |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|0 |1 |2 | |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|1 |3 |4 | |
| |1 |{1 -> 2, 3 -> 4, 5 -> NULL}|2 |5 |NULL | |
| |2 |{} |NULL|NULL|NULL | |
| |3 |NULL |NULL|NULL|NULL | |
| +---+---------------------------+----+----+-----+ |
| """ # noqa: E501 |
| return _invoke_function_over_columns("posexplode_outer", col) |
| |
| |
| @_try_remote_functions |
| def inline_outer(col: "ColumnOrName") -> Column: |
| """ |
| Explodes an array of structs into a table. |
| Unlike inline, if the array is null or empty then null is produced for each nested column. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| input column of values to explode. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| generator expression with the inline exploded result. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.explode` |
| :meth:`pyspark.sql.functions.explode_outer` |
| :meth:`pyspark.sql.functions.posexplode` |
| :meth:`pyspark.sql.functions.posexplode_outer` |
| :meth:`pyspark.sql.functions.inline` |
| |
| Notes |
| ----- |
| Supports Spark Connect. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql('SELECT * FROM VALUES (1,ARRAY(NAMED_STRUCT("a",1,"b",2), NULL, NAMED_STRUCT("a",3,"b",4))), (2,ARRAY()), (3,NULL) AS t(i,s)') |
| >>> df.printSchema() |
| root |
| |-- i: integer (nullable = false) |
| |-- s: array (nullable = true) |
| | |-- element: struct (containsNull = true) |
| | | |-- a: integer (nullable = false) |
| | | |-- b: integer (nullable = false) |
| |
| >>> df.select('*', sf.inline_outer('s')).show(truncate=False) |
| +---+----------------------+----+----+ |
| |i |s |a |b | |
| +---+----------------------+----+----+ |
| |1 |[{1, 2}, NULL, {3, 4}]|1 |2 | |
| |1 |[{1, 2}, NULL, {3, 4}]|NULL|NULL| |
| |1 |[{1, 2}, NULL, {3, 4}]|3 |4 | |
| |2 |[] |NULL|NULL| |
| |3 |NULL |NULL|NULL| |
| +---+----------------------+----+----+ |
| """ # noqa: E501 |
| return _invoke_function_over_columns("inline_outer", col) |
| |
| |
| @_try_remote_functions |
| def get_json_object(col: "ColumnOrName", path: str) -> Column: |
| """ |
Extracts a JSON object from a JSON string based on the specified JSON `path`, and returns
a JSON string of the extracted JSON object. Returns null if the input JSON string is invalid.
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| string column in json format |
| path : str |
| path to the json object to extract |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| string representation of given JSON object value. |
| |
| Examples |
| -------- |
| Example 1: Extract a json object from json string |
| |
| >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] |
| >>> df = spark.createDataFrame(data, ("key", "jstring")) |
| >>> df.select(df.key, |
| ... get_json_object(df.jstring, '$.f1').alias("c0"), |
| ... get_json_object(df.jstring, '$.f2').alias("c1") |
| ... ).show() |
| +---+-------+------+ |
| |key| c0| c1| |
| +---+-------+------+ |
| | 1| value1|value2| |
| | 2|value12| NULL| |
| +---+-------+------+ |
| |
| Example 2: Extract a json object from json array |
| |
| >>> data = [ |
| ... ("1", '''[{"f1": "value1"},{"f1": "value2"}]'''), |
| ... ("2", '''[{"f1": "value12"},{"f2": "value13"}]''') |
| ... ] |
| >>> df = spark.createDataFrame(data, ("key", "jarray")) |
| >>> df.select(df.key, |
| ... get_json_object(df.jarray, '$[0].f1').alias("c0"), |
| ... get_json_object(df.jarray, '$[1].f2').alias("c1") |
| ... ).show() |
| +---+-------+-------+ |
| |key| c0| c1| |
| +---+-------+-------+ |
| | 1| value1| NULL| |
| | 2|value12|value13| |
| +---+-------+-------+ |
| |
| >>> df.select(df.key, |
| ... get_json_object(df.jarray, '$[*].f1').alias("c0"), |
| ... get_json_object(df.jarray, '$[*].f2').alias("c1") |
| ... ).show() |
| +---+-------------------+---------+ |
| |key| c0| c1| |
| +---+-------------------+---------+ |
| | 1|["value1","value2"]| NULL| |
| | 2| "value12"|"value13"| |
| +---+-------------------+---------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("get_json_object", _to_java_column(col), _enum_to_value(path)) |
| |
| |
| @_try_remote_functions |
| def json_tuple(col: "ColumnOrName", *fields: str) -> Column: |
| """Creates a new row for a json column according to the given field names. |
| |
| .. versionadded:: 1.6.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| string column in json format |
| fields : str |
| a field or fields to extract |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a new row for each given field value from json object |
| |
| Examples |
| -------- |
| >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')] |
| >>> df = spark.createDataFrame(data, ("key", "jstring")) |
| >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect() |
| [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)] |
| """ |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| if len(fields) == 0: |
| raise PySparkValueError( |
| errorClass="CANNOT_BE_EMPTY", |
| messageParameters={"item": "field"}, |
| ) |
| sc = _get_active_spark_context() |
| return _invoke_function("json_tuple", _to_java_column(col), _to_seq(sc, fields)) |
| |
| |
| @_try_remote_functions |
| def from_json( |
| col: "ColumnOrName", |
| schema: Union[ArrayType, StructType, Column, str], |
| options: Optional[Mapping[str, str]] = None, |
| ) -> Column: |
| """ |
Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType`
as keys type, a :class:`StructType`, or an :class:`ArrayType` with
the specified schema. Returns `null` in the case of an unparsable string.
| |
| .. versionadded:: 2.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| a column or column name in JSON format |
| schema : :class:`DataType` or str |
a StructType, an ArrayType of StructType, or a Python string literal with a DDL-formatted
string to use when parsing the JSON column
| options : dict, optional |
| options to control parsing. accepts the same options as the json datasource. |
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_ |
| for the version you use. |
| |
| .. # noqa |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a new column of complex type from given JSON object. |
| |
| Examples |
| -------- |
| Example 1: Parsing JSON with a specified schema |
| |
| >>> import pyspark.sql.functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, IntegerType |
| >>> schema = StructType([StructField("a", IntegerType())]) |
| >>> df = spark.createDataFrame([(1, '''{"a": 1}''')], ("key", "value")) |
| >>> df.select(sf.from_json(df.value, schema).alias("json")).show() |
| +----+ |
| |json| |
| +----+ |
| | {1}| |
| +----+ |
| |
| Example 2: Parsing JSON with a DDL-formatted string. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, '''{"a": 1}''')], ("key", "value")) |
| >>> df.select(sf.from_json(df.value, "a INT").alias("json")).show() |
| +----+ |
| |json| |
| +----+ |
| | {1}| |
| +----+ |
| |
| Example 3: Parsing JSON into a MapType |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, '''{"a": 1}''')], ("key", "value")) |
| >>> df.select(sf.from_json(df.value, "MAP<STRING,INT>").alias("json")).show() |
| +--------+ |
| | json| |
| +--------+ |
| |{a -> 1}| |
| +--------+ |
| |
| Example 4: Parsing JSON into an ArrayType of StructType |
| |
| >>> import pyspark.sql.functions as sf |
| >>> from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType |
| >>> schema = ArrayType(StructType([StructField("a", IntegerType())])) |
| >>> df = spark.createDataFrame([(1, '''[{"a": 1}]''')], ("key", "value")) |
| >>> df.select(sf.from_json(df.value, schema).alias("json")).show() |
| +-----+ |
| | json| |
| +-----+ |
| |[{1}]| |
| +-----+ |
| |
| Example 5: Parsing JSON into an ArrayType |
| |
| >>> import pyspark.sql.functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType |
| >>> schema = ArrayType(IntegerType()) |
| >>> df = spark.createDataFrame([(1, '''[1, 2, 3]''')], ("key", "value")) |
| >>> df.select(sf.from_json(df.value, schema).alias("json")).show() |
| +---------+ |
| | json| |
| +---------+ |
| |[1, 2, 3]| |
| +---------+ |
| |
| Example 6: Parsing JSON with specified options |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, '''{a:123}'''), (2, '''{"a":456}''')], ("key", "value")) |
| >>> parsed1 = sf.from_json(df.value, "a INT") |
| >>> parsed2 = sf.from_json(df.value, "a INT", {"allowUnquotedFieldNames": "true"}) |
| >>> df.select("value", parsed1, parsed2).show() |
| +---------+----------------+----------------+ |
| | value|from_json(value)|from_json(value)| |
| +---------+----------------+----------------+ |
| | {a:123}| {NULL}| {123}| |
| |{"a":456}| {456}| {456}| |
| +---------+----------------+----------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if isinstance(schema, DataType): |
| schema = schema.json() |
| elif isinstance(schema, Column): |
| schema = _to_java_column(schema) |
| return _invoke_function("from_json", _to_java_column(col), schema, _options_to_str(options)) |
| |
| |
| @_try_remote_functions |
| def try_parse_json( |
| col: "ColumnOrName", |
| ) -> Column: |
| """ |
Parses a column containing a JSON string into a :class:`VariantType`. Returns None if the
string is an invalid JSON value.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
a column or column name of JSON-formatted strings
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a new column of VariantType. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''}, {'json': '''{a : 1}'''} ]) |
| >>> df.select(to_json(try_parse_json(df.json))).collect() |
| [Row(to_json(try_parse_json(json))='{"a":1}'), Row(to_json(try_parse_json(json))=None)] |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("try_parse_json", _to_java_column(col)) |
| |
| |
| @_try_remote_functions |
| def to_variant_object( |
| col: "ColumnOrName", |
| ) -> Column: |
| """ |
Converts a column containing nested input (array/map/struct) into a variant, where maps and
structs become variant objects, which are unordered unlike SQL structs. Input maps can
only have string keys.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| a column with a nested schema or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a new column of VariantType. |
| |
| Examples |
| -------- |
| Example 1: Converting an array containing a nested struct into a variant |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, StructType, StructField, StringType, MapType |
| >>> schema = StructType([ |
| ... StructField("i", StringType(), True), |
| ... StructField("v", ArrayType(StructType([ |
| ... StructField("a", MapType(StringType(), StringType()), True) |
| ... ]), True)) |
| ... ]) |
| >>> data = [("1", [{"a": {"b": 2}}])] |
| >>> df = spark.createDataFrame(data, schema) |
| >>> df.select(sf.to_variant_object(df.v)) |
| DataFrame[to_variant_object(v): variant] |
| >>> df.select(sf.to_variant_object(df.v)).show(truncate=False) |
| +--------------------+ |
| |to_variant_object(v)| |
| +--------------------+ |
| |[{"a":{"b":"2"}}] | |
| +--------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("to_variant_object", _to_java_column(col)) |
| |
| |
| @_try_remote_functions |
| def parse_json( |
| col: "ColumnOrName", |
| ) -> Column: |
| """ |
Parses a column containing a JSON string into a :class:`VariantType`. Throws an exception if
the string represents an invalid JSON value.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
a column or column name of JSON-formatted strings
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a new column of VariantType. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) |
| >>> df.select(to_json(parse_json(df.json))).collect() |
| [Row(to_json(parse_json(json))='{"a":1}')] |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("parse_json", _to_java_column(col)) |
| |
| |
| @_try_remote_functions |
| def is_variant_null(v: "ColumnOrName") -> Column: |
| """ |
| Check if a variant value is a variant null. Returns true if and only if the input is a variant |
| null and false otherwise (including in the case of SQL NULL). |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| v : :class:`~pyspark.sql.Column` or str |
| a variant column or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a boolean column indicating whether the variant value is a variant null |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) |
| >>> df.select(is_variant_null(parse_json(df.json)).alias("r")).collect() |
| [Row(r=False)] |
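
    A complementary illustrative sketch: a JSON `null` parses to a variant null,
    so the function is expected to return true for it (assuming the same `spark`
    session as above):

    >>> # illustrative: a JSON null is a variant null, unlike a SQL NULL
    >>> df = spark.createDataFrame([ {'json': 'null'} ])
    >>> df.select(is_variant_null(parse_json(df.json)).alias("r")).collect()
    [Row(r=True)]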
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("is_variant_null", _to_java_column(v)) |
| |
| |
| @_try_remote_functions |
| def variant_get(v: "ColumnOrName", path: Union[Column, str], targetType: str) -> Column: |
| """ |
Extracts a sub-variant from `v` according to `path`, and then casts the sub-variant to
| `targetType`. Returns null if the path does not exist. Throws an exception if the cast fails. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| v : :class:`~pyspark.sql.Column` or str |
| a variant column or column name |
| path : :class:`~pyspark.sql.Column` or str |
| a column containing the extraction path strings or a string representing the extraction |
path. A valid path should start with `$` and be followed by zero or more segments like
| `[123]`, `.name`, `['name']`, or `["name"]`. |
| targetType : str |
| the target data type to cast into, in a DDL-formatted string |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a column of `targetType` representing the extracted result |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }''', 'path': '$.a'} ]) |
| >>> df.select(variant_get(parse_json(df.json), "$.a", "int").alias("r")).collect() |
| [Row(r=1)] |
| >>> df.select(variant_get(parse_json(df.json), "$.b", "int").alias("r")).collect() |
| [Row(r=None)] |
| >>> df.select(variant_get(parse_json(df.json), df.path, "int").alias("r")).collect() |
| [Row(r=1)] |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| assert isinstance(path, (Column, str)) |
| if isinstance(path, str): |
| return _invoke_function( |
| "variant_get", _to_java_column(v), _enum_to_value(path), _enum_to_value(targetType) |
| ) |
| else: |
| return _invoke_function( |
| "variant_get", _to_java_column(v), _to_java_column(path), _enum_to_value(targetType) |
| ) |
| |
| |
| @_try_remote_functions |
| def try_variant_get(v: "ColumnOrName", path: Union[Column, str], targetType: str) -> Column: |
| """ |
Extracts a sub-variant from `v` according to `path`, and then casts the sub-variant to
| `targetType`. Returns null if the path does not exist or the cast fails. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| v : :class:`~pyspark.sql.Column` or str |
| a variant column or column name |
| path : :class:`~pyspark.sql.Column` or str |
| a column containing the extraction path strings or a string representing the extraction |
path. A valid path should start with `$` and be followed by zero or more segments like
| `[123]`, `.name`, `['name']`, or `["name"]`. |
| targetType : str |
| the target data type to cast into, in a DDL-formatted string |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a column of `targetType` representing the extracted result |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }''', 'path': '$.a'} ]) |
| >>> df.select(try_variant_get(parse_json(df.json), "$.a", "int").alias("r")).collect() |
| [Row(r=1)] |
| >>> df.select(try_variant_get(parse_json(df.json), "$.b", "int").alias("r")).collect() |
| [Row(r=None)] |
| >>> df.select(try_variant_get(parse_json(df.json), "$.a", "binary").alias("r")).collect() |
| [Row(r=None)] |
| >>> df.select(try_variant_get(parse_json(df.json), df.path, "int").alias("r")).collect() |
| [Row(r=1)] |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if isinstance(path, str): |
| return _invoke_function( |
| "try_variant_get", _to_java_column(v), _enum_to_value(path), _enum_to_value(targetType) |
| ) |
| else: |
| return _invoke_function( |
| "try_variant_get", _to_java_column(v), _to_java_column(path), _enum_to_value(targetType) |
| ) |
| |
| |
| @_try_remote_functions |
| def schema_of_variant(v: "ColumnOrName") -> Column: |
| """ |
Returns the schema of a variant in the SQL format.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| v : :class:`~pyspark.sql.Column` or str |
| a variant column or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a string column representing the variant schema |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) |
| >>> df.select(schema_of_variant(parse_json(df.json)).alias("r")).collect() |
| [Row(r='OBJECT<a: BIGINT>')] |
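
    An array-input sketch (illustrative; assuming the same `spark` session):

    >>> # illustrative: a JSON array is expected to infer an ARRAY schema
    >>> df = spark.createDataFrame([ {'json': '[1, 2]'} ])
    >>> df.select(schema_of_variant(parse_json(df.json)).alias("r")).collect()
    [Row(r='ARRAY<BIGINT>')]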
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("schema_of_variant", _to_java_column(v)) |
| |
| |
| @_try_remote_functions |
| def schema_of_variant_agg(v: "ColumnOrName") -> Column: |
| """ |
Returns the merged schema of a variant column in the SQL format.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| v : :class:`~pyspark.sql.Column` or str |
| a variant column or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a string column representing the variant schema |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([ {'json': '''{ "a" : 1 }'''} ]) |
| >>> df.select(schema_of_variant_agg(parse_json(df.json)).alias("r")).collect() |
| [Row(r='OBJECT<a: BIGINT>')] |
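
    A merging sketch (illustrative; fields of distinct objects are expected to be
    unioned in the merged schema):

    >>> # illustrative: object fields from different rows are merged
    >>> df = spark.createDataFrame([ {'json': '{"a": 1}'}, {'json': '{"b": "2"}'} ])
    >>> df.select(schema_of_variant_agg(parse_json(df.json)).alias("r")).collect()
    [Row(r='OBJECT<a: BIGINT, b: STRING>')]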
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("schema_of_variant_agg", _to_java_column(v)) |
| |
| |
| @_try_remote_functions |
| def to_json(col: "ColumnOrName", options: Optional[Mapping[str, str]] = None) -> Column: |
| """ |
| Converts a column containing a :class:`StructType`, :class:`ArrayType`, :class:`MapType` |
or a :class:`VariantType` into a JSON string. Throws an exception in the case of an unsupported type.
| |
| .. versionadded:: 2.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column containing a struct, an array, a map, or a variant object. |
| options : dict, optional |
| options to control converting. accepts the same options as the JSON datasource. |
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_ |
| for the version you use. |
| Additionally the function supports the `pretty` option which enables |
| pretty JSON generation. |
| |
| .. # noqa |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| JSON object as string column. |
| |
| Examples |
| -------- |
| Example 1: Converting a StructType column to JSON |
| |
| >>> import pyspark.sql.functions as sf |
| >>> from pyspark.sql import Row |
| >>> data = [(1, Row(age=2, name='Alice'))] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| >>> df.select(sf.to_json(df.value).alias("json")).show(truncate=False) |
| +------------------------+ |
| |json | |
| +------------------------+ |
| |{"age":2,"name":"Alice"}| |
| +------------------------+ |
| |
| Example 2: Converting an ArrayType column to JSON |
| |
| >>> import pyspark.sql.functions as sf |
| >>> from pyspark.sql import Row |
| >>> data = [(1, [Row(age=2, name='Alice'), Row(age=3, name='Bob')])] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| >>> df.select(sf.to_json(df.value).alias("json")).show(truncate=False) |
| +-------------------------------------------------+ |
| |json | |
| +-------------------------------------------------+ |
| |[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]| |
| +-------------------------------------------------+ |
| |
| Example 3: Converting a MapType column to JSON |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, {"name": "Alice"})], ("key", "value")) |
| >>> df.select(sf.to_json(df.value).alias("json")).show(truncate=False) |
| +----------------+ |
| |json | |
| +----------------+ |
| |{"name":"Alice"}| |
| +----------------+ |
| |
| Example 4: Converting a VariantType column to JSON |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, '{"name": "Alice"}')], ("key", "value")) |
| >>> df.select(sf.to_json(sf.parse_json(df.value)).alias("json")).show(truncate=False) |
| +----------------+ |
| |json | |
| +----------------+ |
| |{"name":"Alice"}| |
| +----------------+ |
| |
| Example 5: Converting a nested MapType column to JSON |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, [{"name": "Alice"}, {"name": "Bob"}])], ("key", "value")) |
| >>> df.select(sf.to_json(df.value).alias("json")).show(truncate=False) |
| +---------------------------------+ |
| |json | |
| +---------------------------------+ |
| |[{"name":"Alice"},{"name":"Bob"}]| |
| +---------------------------------+ |
| |
| Example 6: Converting a simple ArrayType column to JSON |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(1, ["Alice", "Bob"])], ("key", "value")) |
| >>> df.select(sf.to_json(df.value).alias("json")).show(truncate=False) |
| +---------------+ |
| |json | |
| +---------------+ |
| |["Alice","Bob"]| |
| +---------------+ |
| |
| Example 7: Converting to JSON with specified options |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT (DATE('2022-02-22'), 1) AS date") |
| >>> json1 = sf.to_json(df.date) |
| >>> json2 = sf.to_json(df.date, {"dateFormat": "yyyy/MM/dd"}) |
| >>> df.select("date", json1, json2).show(truncate=False) |
| +---------------+------------------------------+------------------------------+ |
| |date |to_json(date) |to_json(date) | |
| +---------------+------------------------------+------------------------------+ |
| |{2022-02-22, 1}|{"col1":"2022-02-22","col2":1}|{"col1":"2022/02/22","col2":1}| |
| +---------------+------------------------------+------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("to_json", _to_java_column(col), _options_to_str(options)) |
| |
| |
| @_try_remote_functions |
| def schema_of_json(json: Union[Column, str], options: Optional[Mapping[str, str]] = None) -> Column: |
| """ |
| Parses a JSON string and infers its schema in DDL format. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| json : :class:`~pyspark.sql.Column` or str |
| a JSON string or a foldable string column containing a JSON string. |
| options : dict, optional |
| options to control parsing. accepts the same options as the JSON datasource. |
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_ |
| for the version you use. |
| |
| .. # noqa |
| |
| .. versionchanged:: 3.0.0 |
| It accepts `options` parameter to control schema inferring. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a string representation of a :class:`StructType` parsed from given JSON. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> parsed1 = sf.schema_of_json(sf.lit('{"a": 0}')) |
| >>> parsed2 = sf.schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'}) |
| >>> spark.range(1).select(parsed1, parsed2).show() |
| +------------------------+----------------------+ |
| |schema_of_json({"a": 0})|schema_of_json({a: 1})| |
| +------------------------+----------------------+ |
| | STRUCT<a: BIGINT>| STRUCT<a: BIGINT>| |
| +------------------------+----------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| json = _enum_to_value(json) |
| if not isinstance(json, (str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_STR", |
| messageParameters={"arg_name": "json", "arg_type": type(json).__name__}, |
| ) |
| |
| return _invoke_function("schema_of_json", _to_java_column(lit(json)), _options_to_str(options)) |
| |
| |
| @_try_remote_functions |
| def json_array_length(col: "ColumnOrName") -> Column: |
| """ |
Returns the number of elements in the outermost JSON array. `NULL` is returned for any
other valid JSON string, for a `NULL` input, or for an invalid JSON string.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
col : :class:`~pyspark.sql.Column` or str
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| length of json array. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(None,), ('[1, 2, 3]',), ('[]',)], ['data']) |
| >>> df.select(json_array_length(df.data).alias('r')).collect() |
| [Row(r=None), Row(r=3), Row(r=0)] |
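
    A nested-array sketch (illustrative; only elements of the outermost array are
    counted):

    >>> # illustrative: each inner array counts as a single element
    >>> df = spark.createDataFrame([('[[1, 2], [3]]',)], ['data'])
    >>> df.select(json_array_length(df.data).alias('r')).collect()
    [Row(r=2)]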
| """ |
| return _invoke_function_over_columns("json_array_length", col) |
| |
| |
| @_try_remote_functions |
| def json_object_keys(col: "ColumnOrName") -> Column: |
| """ |
Returns all the keys of the outermost JSON object as an array. If the input is any
other valid JSON string, an invalid JSON string, or an empty string, the function
returns null.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
col : :class:`~pyspark.sql.Column` or str
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| all the keys of the outermost JSON object. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(None,), ('{}',), ('{"key1":1, "key2":2}',)], ['data']) |
| >>> df.select(json_object_keys(df.data).alias('r')).collect() |
| [Row(r=None), Row(r=[]), Row(r=['key1', 'key2'])] |
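
    A nested-object sketch (illustrative; only the keys of the outermost object
    are returned):

    >>> # illustrative: nested keys such as 'b' are not listed
    >>> df = spark.createDataFrame([('{"a": {"b": 1}}',)], ['data'])
    >>> df.select(json_object_keys(df.data).alias('r')).collect()
    [Row(r=['a'])]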
| """ |
| return _invoke_function_over_columns("json_object_keys", col) |
| |
| |
| # TODO: Fix and add an example for StructType with Spark Connect |
| # e.g., StructType([StructField("a", IntegerType())]) |
| @_try_remote_functions |
| def from_xml( |
| col: "ColumnOrName", |
| schema: Union[StructType, Column, str], |
| options: Optional[Mapping[str, str]] = None, |
| ) -> Column: |
| """ |
Parses a column containing an XML string into a row with
the specified schema. Returns `null` in the case of an unparsable string.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| a column or column name in XML format |
| schema : :class:`StructType`, :class:`~pyspark.sql.Column` or str |
a StructType, a Column, or a Python string literal with a DDL-formatted string
to use when parsing the XML column
| options : dict, optional |
options to control parsing. accepts the same options as the XML datasource.
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-xml.html#data-source-option>`_ |
| for the version you use. |
| |
| .. # noqa |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a new column of complex type from given XML object. |
| |
| Examples |
| -------- |
| Example 1: Parsing XML with a DDL-formatted string schema |
| |
| >>> import pyspark.sql.functions as sf |
| >>> data = [(1, '''<p><a>1</a></p>''')] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| ... # Define the schema using a DDL-formatted string |
| >>> schema = "STRUCT<a: BIGINT>" |
| ... # Parse the XML column using the DDL-formatted schema |
| >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect() |
| [Row(xml=Row(a=1))] |
| |
| Example 2: Parsing XML with a :class:`StructType` schema |
| |
| >>> import pyspark.sql.functions as sf |
| >>> from pyspark.sql.types import StructType, LongType |
| >>> data = [(1, '''<p><a>1</a></p>''')] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| >>> schema = StructType().add("a", LongType()) |
| >>> df.select(sf.from_xml(df.value, schema)).show() |
| +---------------+ |
| |from_xml(value)| |
| +---------------+ |
| | {1}| |
| +---------------+ |
| |
| Example 3: Parsing XML with :class:`ArrayType` in schema |
| |
| >>> import pyspark.sql.functions as sf |
| >>> data = [(1, '<p><a>1</a><a>2</a></p>')] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| ... # Define the schema with an Array type |
| >>> schema = "STRUCT<a: ARRAY<BIGINT>>" |
| ... # Parse the XML column using the schema with an Array |
| >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect() |
| [Row(xml=Row(a=[1, 2]))] |
| |
| Example 4: Parsing XML using :meth:`pyspark.sql.functions.schema_of_xml` |
| |
| >>> import pyspark.sql.functions as sf |
| >>> # Sample data with an XML column |
| ... data = [(1, '<p><a>1</a><a>2</a></p>')] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| ... # Generate the schema from an example XML value |
| >>> schema = sf.schema_of_xml(sf.lit(data[0][1])) |
| ... # Parse the XML column using the generated schema |
| >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect() |
| [Row(xml=Row(a=[1, 2]))] |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if isinstance(schema, StructType): |
| schema = schema.json() |
| elif isinstance(schema, Column): |
| schema = _to_java_column(schema) |
| elif not isinstance(schema, str): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_STR_OR_STRUCT", |
| messageParameters={"arg_name": "schema", "arg_type": type(schema).__name__}, |
| ) |
| return _invoke_function("from_xml", _to_java_column(col), schema, _options_to_str(options)) |
| |
| |
| @_try_remote_functions |
| def schema_of_xml(xml: Union[Column, str], options: Optional[Mapping[str, str]] = None) -> Column: |
| """ |
Parses an XML string and infers its schema in DDL format.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| xml : :class:`~pyspark.sql.Column` or str |
an XML string or a foldable string column containing an XML string.
| options : dict, optional |
| options to control parsing. accepts the same options as the XML datasource. |
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-xml.html#data-source-option>`_ |
| for the version you use. |
| |
| .. # noqa |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a string representation of a :class:`StructType` parsed from given XML. |
| |
| Examples |
| -------- |
| Example 1: Parsing a simple XML with a single element |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(1) |
| >>> df.select(sf.schema_of_xml(sf.lit('<p><a>1</a></p>')).alias("xml")).collect() |
| [Row(xml='STRUCT<a: BIGINT>')] |
| |
| Example 2: Parsing an XML with multiple elements in an array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df.select(sf.schema_of_xml(sf.lit('<p><a>1</a><a>2</a></p>')).alias("xml")).collect() |
| [Row(xml='STRUCT<a: ARRAY<BIGINT>>')] |
| |
| Example 3: Parsing XML with options to exclude attributes |
| |
| >>> from pyspark.sql import functions as sf |
| >>> schema = sf.schema_of_xml('<p><a attr="2">1</a></p>', {'excludeAttribute':'true'}) |
| >>> df.select(schema.alias("xml")).collect() |
| [Row(xml='STRUCT<a: BIGINT>')] |
| |
| Example 4: Parsing XML with complex structure |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df.select( |
| ... sf.schema_of_xml( |
| ... sf.lit('<root><person><name>Alice</name><age>30</age></person></root>') |
| ... ).alias("xml") |
| ... ).collect() |
| [Row(xml='STRUCT<person: STRUCT<age: BIGINT, name: STRING>>')] |
| |
| Example 5: Parsing XML with nested arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df.select( |
| ... sf.schema_of_xml( |
| ... sf.lit('<data><values><value>1</value><value>2</value></values></data>') |
| ... ).alias("xml") |
| ... ).collect() |
| [Row(xml='STRUCT<values: STRUCT<value: ARRAY<BIGINT>>>')] |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| xml = _enum_to_value(xml) |
| if not isinstance(xml, (str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_STR", |
| messageParameters={"arg_name": "xml", "arg_type": type(xml).__name__}, |
| ) |
| |
| return _invoke_function("schema_of_xml", _to_java_column(lit(xml)), _options_to_str(options)) |
| |
| |
| @_try_remote_functions |
| def to_xml(col: "ColumnOrName", options: Optional[Mapping[str, str]] = None) -> Column: |
| """ |
Converts a column containing a :class:`StructType` into an XML string.
Throws an exception in the case of an unsupported type.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column containing a struct. |
options : dict, optional
| options to control converting. accepts the same options as the XML datasource. |
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-xml.html#data-source-option>`_ |
| for the version you use. |
| |
| .. # noqa |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
an XML string converted from the given :class:`StructType`.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import Row |
| >>> data = [(1, Row(age=2, name='Alice'))] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| >>> df.select(to_xml(df.value, {'rowTag':'person'}).alias("xml")).collect() |
| [Row(xml='<person>\\n <age>2</age>\\n <name>Alice</name>\\n</person>')] |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("to_xml", _to_java_column(col), _options_to_str(options)) |
| |
| |
| @_try_remote_functions |
| def schema_of_csv(csv: Union[Column, str], options: Optional[Mapping[str, str]] = None) -> Column: |
| """ |
| CSV Function: Parses a CSV string and infers its schema in DDL format. |
| |
| .. versionadded:: 3.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| csv : :class:`~pyspark.sql.Column` or str |
| A CSV string or a foldable string column containing a CSV string. |
| options : dict, optional |
| Options to control parsing. Accepts the same options as the CSV datasource. |
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_ |
| for the version you use. |
| |
| .. # noqa |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A string representation of a :class:`StructType` parsed from the given CSV. |
| |
| Examples |
| -------- |
| Example 1: Inferring the schema of a CSV string with different data types |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(1) |
| >>> df.select(sf.schema_of_csv(sf.lit('1|a|true'), {'sep':'|'})).show(truncate=False) |
| +-------------------------------------------+ |
| |schema_of_csv(1|a|true) | |
| +-------------------------------------------+ |
| |STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>| |
| +-------------------------------------------+ |
| |
| Example 2: Inferring the schema of a CSV string with missing values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(1) |
| >>> df.select(sf.schema_of_csv(sf.lit('1||true'), {'sep':'|'})).show(truncate=False) |
| +-------------------------------------------+ |
| |schema_of_csv(1||true) | |
| +-------------------------------------------+ |
| |STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>| |
| +-------------------------------------------+ |
| |
| Example 3: Inferring the schema of a CSV string with a different delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(1) |
| >>> df.select(sf.schema_of_csv(sf.lit('1;a;true'), {'sep':';'})).show(truncate=False) |
| +-------------------------------------------+ |
| |schema_of_csv(1;a;true) | |
| +-------------------------------------------+ |
| |STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>| |
| +-------------------------------------------+ |
| |
| Example 4: Inferring the schema of a CSV string with quoted fields |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.range(1) |
| >>> df.select(sf.schema_of_csv(sf.lit('"1","a","true"'), {'sep':','})).show(truncate=False) |
| +-------------------------------------------+ |
| |schema_of_csv("1","a","true") | |
| +-------------------------------------------+ |
| |STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>| |
| +-------------------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| csv = _enum_to_value(csv) |
| if not isinstance(csv, (str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_STR", |
| messageParameters={"arg_name": "csv", "arg_type": type(csv).__name__}, |
| ) |
| |
| return _invoke_function("schema_of_csv", _to_java_column(lit(csv)), _options_to_str(options)) |
| |
| |
| @_try_remote_functions |
| def to_csv(col: "ColumnOrName", options: Optional[Mapping[str, str]] = None) -> Column: |
| """ |
| CSV Function: Converts a column containing a :class:`StructType` into a CSV string. |
Throws an exception in the case of an unsupported type.
| |
| .. versionadded:: 3.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Name of column containing a struct. |
options : dict, optional
| Options to control converting. Accepts the same options as the CSV datasource. |
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_ |
| for the version you use. |
| |
| .. # noqa |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A CSV string converted from the given :class:`StructType`. |
| |
| Examples |
| -------- |
| Example 1: Converting a simple StructType to a CSV string |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> data = [(1, Row(age=2, name='Alice'))] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| >>> df.select(sf.to_csv(df.value)).show() |
| +-------------+ |
| |to_csv(value)| |
| +-------------+ |
| | 2,Alice| |
| +-------------+ |
| |
| Example 2: Converting a complex StructType to a CSV string |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> data = [(1, Row(age=2, name='Alice', scores=[100, 200, 300]))] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| >>> df.select(sf.to_csv(df.value)).show(truncate=False) |
| +-------------------------+ |
| |to_csv(value) | |
| +-------------------------+ |
| |2,Alice,"[100, 200, 300]"| |
| +-------------------------+ |
| |
| Example 3: Converting a StructType with null values to a CSV string |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType |
| >>> data = [(1, Row(age=None, name='Alice'))] |
| >>> schema = StructType([ |
| ... StructField("key", IntegerType(), True), |
| ... StructField("value", StructType([ |
| ... StructField("age", IntegerType(), True), |
| ... StructField("name", StringType(), True) |
| ... ]), True) |
| ... ]) |
| >>> df = spark.createDataFrame(data, schema) |
| >>> df.select(sf.to_csv(df.value)).show() |
| +-------------+ |
| |to_csv(value)| |
| +-------------+ |
| | ,Alice| |
| +-------------+ |
| |
| Example 4: Converting a StructType with different data types to a CSV string |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> data = [(1, Row(age=2, name='Alice', isStudent=True))] |
| >>> df = spark.createDataFrame(data, ("key", "value")) |
| >>> df.select(sf.to_csv(df.value)).show() |
| +-------------+ |
| |to_csv(value)| |
| +-------------+ |
| | 2,Alice,true| |
| +-------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("to_csv", _to_java_column(col), _options_to_str(options)) |
| |
| |
| @_try_remote_functions |
| def size(col: "ColumnOrName") -> Column: |
| """ |
| Collection function: returns the length of the array or map stored in the column. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| length of the array/map. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], ['data']) |
| >>> df.select(size(df.data)).collect() |
| [Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)] |
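
Maps are supported as well; for example:

>>> df = spark.sql("SELECT map('a', 1, 'b', 2) AS data")
>>> df.select(size(df.data)).collect()
[Row(size(data)=2)]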
| """ |
| return _invoke_function_over_columns("size", col) |
| |
| |
| @_try_remote_functions |
| def array_min(col: "ColumnOrName") -> Column: |
| """ |
| Array function: returns the minimum value of the array. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or an expression that represents the array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains the minimum value of each array. |
| |
| Examples |
| -------- |
| Example 1: Basic usage with integer array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data']) |
| >>> df.select(sf.array_min(df.data)).show() |
| +---------------+ |
| |array_min(data)| |
| +---------------+ |
| | 1| |
| | -1| |
| +---------------+ |
| |
| Example 2: Usage with string array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['apple', 'banana', 'cherry'],)], ['data']) |
| >>> df.select(sf.array_min(df.data)).show() |
| +---------------+ |
| |array_min(data)| |
| +---------------+ |
| | apple| |
| +---------------+ |
| |
| Example 3: Usage with mixed type array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['apple', 1, 'cherry'],)], ['data']) |
| >>> df.select(sf.array_min(df.data)).show() |
| +---------------+ |
| |array_min(data)| |
| +---------------+ |
| | 1| |
| +---------------+ |
| |
| Example 4: Usage with array of arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([[2, 1], [3, 4]],)], ['data']) |
| >>> df.select(sf.array_min(df.data)).show() |
| +---------------+ |
| |array_min(data)| |
| +---------------+ |
| | [2, 1]| |
| +---------------+ |
| |
| Example 5: Usage with empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema=schema) |
| >>> df.select(sf.array_min(df.data)).show() |
| +---------------+ |
| |array_min(data)| |
| +---------------+ |
| | NULL| |
| +---------------+ |
| """ |
| return _invoke_function_over_columns("array_min", col) |
| |
| |
| @_try_remote_functions |
| def array_max(col: "ColumnOrName") -> Column: |
| """ |
| Array function: returns the maximum value of the array. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or an expression that represents the array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains the maximum value of each array. |
| |
| Examples |
| -------- |
| Example 1: Basic usage with integer array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data']) |
| >>> df.select(sf.array_max(df.data)).show() |
| +---------------+ |
| |array_max(data)| |
| +---------------+ |
| | 3| |
| | 10| |
| +---------------+ |
| |
| Example 2: Usage with string array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['apple', 'banana', 'cherry'],)], ['data']) |
| >>> df.select(sf.array_max(df.data)).show() |
| +---------------+ |
| |array_max(data)| |
| +---------------+ |
| | cherry| |
| +---------------+ |
| |
| Example 3: Usage with mixed type array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['apple', 1, 'cherry'],)], ['data']) |
| >>> df.select(sf.array_max(df.data)).show() |
| +---------------+ |
| |array_max(data)| |
| +---------------+ |
| | cherry| |
| +---------------+ |
| |
| Example 4: Usage with array of arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([[2, 1], [3, 4]],)], ['data']) |
| >>> df.select(sf.array_max(df.data)).show() |
| +---------------+ |
| |array_max(data)| |
| +---------------+ |
| | [3, 4]| |
| +---------------+ |
| |
| Example 5: Usage with empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema=schema) |
| >>> df.select(sf.array_max(df.data)).show() |
| +---------------+ |
| |array_max(data)| |
| +---------------+ |
| | NULL| |
| +---------------+ |
| """ |
| return _invoke_function_over_columns("array_max", col) |
| |
| |
| @_try_remote_functions |
| def array_size(col: "ColumnOrName") -> Column: |
| """ |
| Array function: returns the total number of elements in the array. |
| The function returns null for null input. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or an expression that represents the array. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains the size of each array. |
| |
| Examples |
| -------- |
| Example 1: Basic usage with integer array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([2, 1, 3],), (None,)], ['data']) |
| >>> df.select(sf.array_size(df.data)).show() |
| +----------------+ |
| |array_size(data)| |
| +----------------+ |
| | 3| |
| | NULL| |
| +----------------+ |
| |
| Example 2: Usage with string array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['apple', 'banana', 'cherry'],)], ['data']) |
| >>> df.select(sf.array_size(df.data)).show() |
| +----------------+ |
| |array_size(data)| |
| +----------------+ |
| | 3| |
| +----------------+ |
| |
| Example 3: Usage with mixed type array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['apple', 1, 'cherry'],)], ['data']) |
| >>> df.select(sf.array_size(df.data)).show() |
| +----------------+ |
| |array_size(data)| |
| +----------------+ |
| | 3| |
| +----------------+ |
| |
| Example 4: Usage with array of arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([[2, 1], [3, 4]],)], ['data']) |
| >>> df.select(sf.array_size(df.data)).show() |
| +----------------+ |
| |array_size(data)| |
| +----------------+ |
| | 2| |
| +----------------+ |
| |
| Example 5: Usage with empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType(IntegerType()), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema=schema) |
| >>> df.select(sf.array_size(df.data)).show() |
| +----------------+ |
| |array_size(data)| |
| +----------------+ |
| | 0| |
| +----------------+ |
| """ |
| return _invoke_function_over_columns("array_size", col) |
| |
| |
| @_try_remote_functions |
| def cardinality(col: "ColumnOrName") -> Column: |
| """ |
| Collection function: returns the length of the array or map stored in the column. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| target column to compute on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| length of the array/map. |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.createDataFrame( |
| ... [([1, 2, 3],),([1],),([],)], ['data'] |
| ... ).select(sf.cardinality("data")).show() |
| +-----------------+ |
| |cardinality(data)| |
| +-----------------+ |
| | 3| |
| | 1| |
| | 0| |
| +-----------------+ |
| """ |
| return _invoke_function_over_columns("cardinality", col) |
| |
| |
| @_try_remote_functions |
| def sort_array(col: "ColumnOrName", asc: bool = True) -> Column: |
| """ |
| Array function: Sorts the input array in ascending or descending order according |
| to the natural ordering of the array elements. Null elements will be placed at the beginning |
| of the returned array in ascending order or at the end of the returned array in descending |
| order. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Name of the column or expression. |
| asc : bool, optional |
| Whether to sort in ascending or descending order. If `asc` is True (default), |
| then the sorting is in ascending order. If False, then in descending order. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Sorted array. |
| |
| Examples |
| -------- |
| Example 1: Sorting an array in ascending order |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([([2, 1, None, 3],)], ['data']) |
| >>> df.select(sf.sort_array(df.data)).show() |
| +----------------------+ |
| |sort_array(data, true)| |
| +----------------------+ |
| | [NULL, 1, 2, 3]| |
| +----------------------+ |
| |
| Example 2: Sorting an array in descending order |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([([2, 1, None, 3],)], ['data']) |
| >>> df.select(sf.sort_array(df.data, asc=False)).show() |
| +-----------------------+ |
| |sort_array(data, false)| |
| +-----------------------+ |
| | [3, 2, 1, NULL]| |
| +-----------------------+ |
| |
| Example 3: Sorting an array with a single element |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([([1],)], ['data']) |
| >>> df.select(sf.sort_array(df.data)).show() |
| +----------------------+ |
| |sort_array(data, true)| |
| +----------------------+ |
| | [1]| |
| +----------------------+ |
| |
| Example 4: Sorting an empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType |
| >>> schema = StructType([StructField("data", ArrayType(StringType()), True)]) |
| >>> df = spark.createDataFrame([([],)], schema=schema) |
| >>> df.select(sf.sort_array(df.data)).show() |
| +----------------------+ |
| |sort_array(data, true)| |
| +----------------------+ |
| | []| |
| +----------------------+ |
| |
| Example 5: Sorting an array with null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField |
| >>> schema = StructType([StructField("data", ArrayType(IntegerType()), True)]) |
| >>> df = spark.createDataFrame([([None, None, None],)], schema=schema) |
| >>> df.select(sf.sort_array(df.data)).show() |
| +----------------------+ |
| |sort_array(data, true)| |
| +----------------------+ |
| | [NULL, NULL, NULL]| |
| +----------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("sort_array", _to_java_column(col), _enum_to_value(asc)) |
| |
| |
| @_try_remote_functions |
| def array_sort( |
| col: "ColumnOrName", comparator: Optional[Callable[[Column, Column], Column]] = None |
| ) -> Column: |
| """ |
| Collection function: sorts the input array in ascending order. The elements of the input array |
| must be orderable. Null elements will be placed at the end of the returned array. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Can take a `comparator` function. |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| comparator : callable, optional |
A binary function ``(Column, Column) -> Column: ...``.
| The comparator will take two |
| arguments representing two elements of the array. It returns a negative integer, 0, or a |
| positive integer as the first element is less than, equal to, or greater than the second |
| element. If the comparator function returns null, the function will fail and raise an error. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| sorted array. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data']) |
| >>> df.select(array_sort(df.data).alias('r')).collect() |
| [Row(r=[1, 2, 3, None]), Row(r=[1]), Row(r=[])] |
| >>> df = spark.createDataFrame([(["foo", "foobar", None, "bar"],),(["foo"],),([],)], ['data']) |
| >>> df.select(array_sort( |
| ... "data", |
| ... lambda x, y: when(x.isNull() | y.isNull(), lit(0)).otherwise(length(y) - length(x)) |
| ... ).alias("r")).collect() |
| [Row(r=['foobar', 'foo', None, 'bar']), Row(r=['foo']), Row(r=[])] |
| """ |
| if comparator is None: |
| return _invoke_function_over_columns("array_sort", col) |
| else: |
| return _invoke_higher_order_function("array_sort", [col], [comparator]) |
| |
| |
| @_try_remote_functions |
| def shuffle(col: "ColumnOrName", seed: Optional[Union[Column, int]] = None) -> Column: |
| """ |
| Array function: Generates a random permutation of the given array. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or expression to be shuffled. |
| seed : :class:`~pyspark.sql.Column` or int, optional |
| Seed value for the random generator. |
| |
| .. versionadded:: 4.0.0 |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains an array of elements in random order. |
| |
| Notes |
| ----- |
| The `shuffle` function is non-deterministic, meaning the order of the output array |
| can be different for each execution. |
| |
| Examples |
| -------- |
| Example 1: Shuffling a simple array |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT ARRAY(1, 20, 3, 5) AS data") |
| >>> df.select("*", sf.shuffle(df.data, sf.lit(123))).show() |
| +-------------+-------------+ |
| | data|shuffle(data)| |
| +-------------+-------------+ |
| |[1, 20, 3, 5]|[5, 1, 20, 3]| |
| +-------------+-------------+ |
| |
| Example 2: Shuffling an array with null values |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT ARRAY(1, 20, NULL, 5) AS data") |
| >>> df.select("*", sf.shuffle(sf.col("data"), 234)).show() |
| +----------------+----------------+ |
| | data| shuffle(data)| |
| +----------------+----------------+ |
| |[1, 20, NULL, 5]|[NULL, 5, 20, 1]| |
| +----------------+----------------+ |
| |
| Example 3: Shuffling an array with duplicate values |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT ARRAY(1, 2, 2, 3, 3, 3) AS data") |
| >>> df.select("*", sf.shuffle("data", 345)).show() |
| +------------------+------------------+ |
| | data| shuffle(data)| |
| +------------------+------------------+ |
| |[1, 2, 2, 3, 3, 3]|[2, 3, 3, 1, 2, 3]| |
| +------------------+------------------+ |
| |
| Example 4: Shuffling an array with random seed |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.sql("SELECT ARRAY(1, 2, 2, 3, 3, 3) AS data") |
| >>> df.select("*", sf.shuffle("data")).show() # doctest: +SKIP |
| +------------------+------------------+ |
| | data| shuffle(data)| |
| +------------------+------------------+ |
| |[1, 2, 2, 3, 3, 3]|[3, 3, 2, 3, 2, 1]| |
| +------------------+------------------+ |
| """ |
| if seed is not None: |
| return _invoke_function_over_columns("shuffle", col, lit(seed)) |
| else: |
| return _invoke_function_over_columns("shuffle", col) |
| |
| |
| @_try_remote_functions |
| def reverse(col: "ColumnOrName") -> Column: |
| """ |
| Collection function: returns a reversed string or an array with elements in reverse order. |
| |
| .. versionadded:: 1.5.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or an expression that represents the element to be reversed. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a reversed string or an array with elements in reverse order. |
| |
| Examples |
| -------- |
| Example 1: Reverse a string |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('Spark SQL',)], ['data']) |
| >>> df.select(sf.reverse(df.data)).show() |
| +-------------+ |
| |reverse(data)| |
| +-------------+ |
| | LQS krapS| |
| +-------------+ |
| |
| Example 2: Reverse an array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data']) |
| >>> df.select(sf.reverse(df.data)).show() |
| +-------------+ |
| |reverse(data)| |
| +-------------+ |
| | [3, 1, 2]| |
| | [1]| |
| | []| |
| +-------------+ |
| """ |
| return _invoke_function_over_columns("reverse", col) |
| |
| |
| @_try_remote_functions |
| def flatten(col: "ColumnOrName") -> Column: |
| """ |
| Array function: creates a single array from an array of arrays. |
| If a structure of nested arrays is deeper than two levels, |
| only one level of nesting is removed. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or expression to be flattened. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains the flattened array. |
| |
| Examples |
| -------- |
| Example 1: Flattening a simple nested array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([[1, 2, 3], [4, 5], [6]],)], ['data']) |
| >>> df.select(sf.flatten(df.data)).show() |
| +------------------+ |
| | flatten(data)| |
| +------------------+ |
| |[1, 2, 3, 4, 5, 6]| |
| +------------------+ |
| |
| Example 2: Flattening an array with null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([None, [4, 5]],)], ['data']) |
| >>> df.select(sf.flatten(df.data)).show() |
| +-------------+ |
| |flatten(data)| |
| +-------------+ |
| | NULL| |
| +-------------+ |
| |
| Example 3: Flattening an array with more than two levels of nesting |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([[[1, 2], [3, 4]], [[5, 6], [7, 8]]],)], ['data']) |
| >>> df.select(sf.flatten(df.data)).show(truncate=False) |
| +--------------------------------+ |
| |flatten(data) | |
| +--------------------------------+ |
| |[[1, 2], [3, 4], [5, 6], [7, 8]]| |
| +--------------------------------+ |
| |
| Example 4: Flattening an array with mixed types |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([['a', 'b', 'c'], [1, 2, 3]],)], ['data']) |
| >>> df.select(sf.flatten(df.data)).show() |
| +------------------+ |
| | flatten(data)| |
| +------------------+ |
| |[a, b, c, 1, 2, 3]| |
| +------------------+ |
| """ |
| return _invoke_function_over_columns("flatten", col) |
| |
| |
| @_try_remote_functions |
| def map_contains_key(col: "ColumnOrName", value: Any) -> Column: |
| """ |
| Map function: Returns true if the map contains the key. |
| |
| .. versionadded:: 3.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or an expression that represents the map. |
| value : |
| A literal value, or a :class:`~pyspark.sql.Column` expression. |
| |
| .. versionchanged:: 4.0.0 |
| `value` now also accepts a Column type. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| True if key is in the map and False otherwise. |
| |
| Examples |
| -------- |
| Example 1: The key is in the map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data") |
| >>> df.select(sf.map_contains_key("data", 1)).show() |
| +-------------------------+ |
| |map_contains_key(data, 1)| |
| +-------------------------+ |
| | true| |
| +-------------------------+ |
| |
| Example 2: The key is not in the map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data") |
| >>> df.select(sf.map_contains_key("data", -1)).show() |
| +--------------------------+ |
| |map_contains_key(data, -1)| |
| +--------------------------+ |
| | false| |
| +--------------------------+ |
| |
| Example 3: Check for key using a column |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data, 1 as key") |
| >>> df.select(sf.map_contains_key("data", sf.col("key"))).show() |
| +---------------------------+ |
| |map_contains_key(data, key)| |
| +---------------------------+ |
| | true| |
| +---------------------------+ |
| """ |
| return _invoke_function_over_columns("map_contains_key", col, lit(value)) |
| |
| |
| @_try_remote_functions |
| def map_keys(col: "ColumnOrName") -> Column: |
| """ |
| Map function: Returns an unordered array containing the keys of the map. |
| |
| .. versionadded:: 2.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Name of column or expression |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Keys of the map as an array. |
| |
| Examples |
| -------- |
| Example 1: Extracting keys from a simple map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data") |
| >>> df.select(sf.sort_array(sf.map_keys("data"))).show() |
| +--------------------------------+ |
| |sort_array(map_keys(data), true)| |
| +--------------------------------+ |
| | [1, 2]| |
| +--------------------------------+ |
| |
| Example 2: Extracting keys from a map with complex keys |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(array(1, 2), 'a', array(3, 4), 'b') as data") |
| >>> df.select(sf.sort_array(sf.map_keys("data"))).show() |
| +--------------------------------+ |
| |sort_array(map_keys(data), true)| |
| +--------------------------------+ |
| | [[1, 2], [3, 4]]| |
| +--------------------------------+ |
| |
| Example 3: Extracting keys from a map with duplicate keys |
| |
| >>> from pyspark.sql import functions as sf |
| >>> originalmapKeyDedupPolicy = spark.conf.get("spark.sql.mapKeyDedupPolicy") |
| >>> spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN") |
| >>> df = spark.sql("SELECT map(1, 'a', 1, 'b') as data") |
| >>> df.select(sf.map_keys("data")).show() |
| +--------------+ |
| |map_keys(data)| |
| +--------------+ |
| | [1]| |
| +--------------+ |
| >>> spark.conf.set("spark.sql.mapKeyDedupPolicy", originalmapKeyDedupPolicy) |
| |
| Example 4: Extracting keys from an empty map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map() as data") |
| >>> df.select(sf.map_keys("data")).show() |
| +--------------+ |
| |map_keys(data)| |
| +--------------+ |
| | []| |
| +--------------+ |
| """ |
| return _invoke_function_over_columns("map_keys", col) |
| |
| |
| @_try_remote_functions |
| def map_values(col: "ColumnOrName") -> Column: |
| """ |
| Map function: Returns an unordered array containing the values of the map. |
| |
| .. versionadded:: 2.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Name of column or expression |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Values of the map as an array. |
| |
| Examples |
| -------- |
| Example 1: Extracting values from a simple map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data") |
| >>> df.select(sf.sort_array(sf.map_values("data"))).show() |
| +----------------------------------+ |
| |sort_array(map_values(data), true)| |
| +----------------------------------+ |
| | [a, b]| |
| +----------------------------------+ |
| |
| Example 2: Extracting values from a map with complex values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, array('a', 'b'), 2, array('c', 'd')) as data") |
| >>> df.select(sf.sort_array(sf.map_values("data"))).show() |
| +----------------------------------+ |
| |sort_array(map_values(data), true)| |
| +----------------------------------+ |
| | [[a, b], [c, d]]| |
| +----------------------------------+ |
| |
| Example 3: Extracting values from a map with null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, null, 2, 'b') as data") |
| >>> df.select(sf.sort_array(sf.map_values("data"))).show() |
| +----------------------------------+ |
| |sort_array(map_values(data), true)| |
| +----------------------------------+ |
| | [NULL, b]| |
| +----------------------------------+ |
| |
| Example 4: Extracting values from a map with duplicate values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'a') as data") |
| >>> df.select(sf.map_values("data")).show() |
| +----------------+ |
| |map_values(data)| |
| +----------------+ |
| | [a, a]| |
| +----------------+ |
| |
| Example 5: Extracting values from an empty map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map() as data") |
| >>> df.select(sf.map_values("data")).show() |
| +----------------+ |
| |map_values(data)| |
| +----------------+ |
| | []| |
| +----------------+ |
| """ |
| return _invoke_function_over_columns("map_values", col) |
| |
| |
| @_try_remote_functions |
| def map_entries(col: "ColumnOrName") -> Column: |
| """ |
| Map function: Returns an unordered array of all entries in the given map. |
| |
| .. versionadded:: 3.0.0 |
| |
| .. versionchanged:: 3.4.0 |
Supports Spark Connect.
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Name of column or expression |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
An array of key-value pairs as a struct type.
| |
| Examples |
| -------- |
| Example 1: Extracting entries from a simple map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data") |
| >>> df.select(sf.sort_array(sf.map_entries("data"))).show() |
| +-----------------------------------+ |
| |sort_array(map_entries(data), true)| |
| +-----------------------------------+ |
| | [{1, a}, {2, b}]| |
| +-----------------------------------+ |
| |
| Example 2: Extracting entries from a map with complex keys and values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(array(1, 2), array('a', 'b'), " |
| ... "array(3, 4), array('c', 'd')) as data") |
| >>> df.select(sf.sort_array(sf.map_entries("data"))).show(truncate=False) |
| +------------------------------------+ |
| |sort_array(map_entries(data), true) | |
| +------------------------------------+ |
| |[{[1, 2], [a, b]}, {[3, 4], [c, d]}]| |
| +------------------------------------+ |
| |
| Example 3: Extracting entries from a map with duplicate keys |
| |
| >>> from pyspark.sql import functions as sf |
| >>> originalmapKeyDedupPolicy = spark.conf.get("spark.sql.mapKeyDedupPolicy") |
| >>> spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN") |
| >>> df = spark.sql("SELECT map(1, 'a', 1, 'b') as data") |
| >>> df.select(sf.map_entries("data")).show() |
| +-----------------+ |
| |map_entries(data)| |
| +-----------------+ |
| | [{1, b}]| |
| +-----------------+ |
| >>> spark.conf.set("spark.sql.mapKeyDedupPolicy", originalmapKeyDedupPolicy) |
| |
| Example 4: Extracting entries from an empty map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map() as data") |
| >>> df.select(sf.map_entries("data")).show() |
| +-----------------+ |
| |map_entries(data)| |
| +-----------------+ |
| | []| |
| +-----------------+ |
| """ |
| return _invoke_function_over_columns("map_entries", col) |
| |
| |
| @_try_remote_functions |
| def map_from_entries(col: "ColumnOrName") -> Column: |
| """ |
| Map function: Transforms an array of key-value pair entries (structs with two fields) |
| into a map. The first field of each entry is used as the key and the second field |
as the value in the resulting map column.
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| Name of column or expression |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A map created from the given array of entries. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of map_from_entries |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT array(struct(1, 'a'), struct(2, 'b')) as data") |
| >>> df.select(sf.map_from_entries(df.data)).show() |
| +----------------------+ |
| |map_from_entries(data)| |
| +----------------------+ |
| | {1 -> a, 2 -> b}| |
| +----------------------+ |
| |
| Example 2: map_from_entries with null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT array(struct(1, null), struct(2, 'b')) as data") |
| >>> df.select(sf.map_from_entries(df.data)).show() |
| +----------------------+ |
| |map_from_entries(data)| |
| +----------------------+ |
| | {1 -> NULL, 2 -> b}| |
| +----------------------+ |
| |
| Example 3: map_from_entries with a DataFrame |
| |
| >>> from pyspark.sql import Row, functions as sf |
| >>> df = spark.createDataFrame([([Row(1, "a"), Row(2, "b")],), ([Row(3, "c")],)], ['data']) |
| >>> df.select(sf.map_from_entries(df.data)).show() |
| +----------------------+ |
| |map_from_entries(data)| |
| +----------------------+ |
| | {1 -> a, 2 -> b}| |
| | {3 -> c}| |
| +----------------------+ |
| |
| Example 4: map_from_entries with empty array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import ArrayType, StringType, IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", ArrayType( |
| ... StructType([ |
| ... StructField("key", IntegerType()), |
| ... StructField("value", StringType()) |
| ... ]) |
| ... ), True) |
| ... ]) |
| >>> df = spark.createDataFrame([([],)], schema=schema) |
| >>> df.select(sf.map_from_entries(df.data)).show() |
| +----------------------+ |
| |map_from_entries(data)| |
| +----------------------+ |
| | {}| |
| +----------------------+ |
| """ |
| return _invoke_function_over_columns("map_from_entries", col) |
| |
| |
| @_try_remote_functions |
| def array_repeat(col: "ColumnOrName", count: Union["ColumnOrName", int]) -> Column: |
| """ |
| Array function: creates an array containing a column repeated count times. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or an expression that represents the element to be repeated. |
| count : :class:`~pyspark.sql.Column` or str or int |
| The name of the column, an expression, |
| or an integer that represents the number of times to repeat the element. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains an array of repeated elements. |
| |
| Examples |
| -------- |
| Example 1: Usage with string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([('ab',)], ['data']) |
| >>> df.select(sf.array_repeat(df.data, 3)).show() |
| +---------------------+ |
| |array_repeat(data, 3)| |
| +---------------------+ |
| | [ab, ab, ab]| |
| +---------------------+ |
| |
| Example 2: Usage with integer |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(3,)], ['data']) |
| >>> df.select(sf.array_repeat(df.data, 2)).show() |
| +---------------------+ |
| |array_repeat(data, 2)| |
| +---------------------+ |
| | [3, 3]| |
| +---------------------+ |
| |
| Example 3: Usage with array |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(['apple', 'banana'],)], ['data']) |
| >>> df.select(sf.array_repeat(df.data, 2)).show(truncate=False) |
| +----------------------------------+ |
| |array_repeat(data, 2) | |
| +----------------------------------+ |
| |[[apple, banana], [apple, banana]]| |
| +----------------------------------+ |
| |
| Example 4: Usage with null |
| |
| >>> from pyspark.sql import functions as sf |
| >>> from pyspark.sql.types import IntegerType, StructType, StructField |
| >>> schema = StructType([ |
| ... StructField("data", IntegerType(), True) |
| ... ]) |
| >>> df = spark.createDataFrame([(None, )], schema=schema) |
| >>> df.select(sf.array_repeat(df.data, 3)).show() |
| +---------------------+ |
| |array_repeat(data, 3)| |
| +---------------------+ |
| | [NULL, NULL, NULL]| |
| +---------------------+ |
| """ |
| count = _enum_to_value(count) |
| count = lit(count) if isinstance(count, int) else count |
| |
| return _invoke_function_over_columns("array_repeat", col, count) |
| |
| |
| @_try_remote_functions |
| def arrays_zip(*cols: "ColumnOrName") -> Column: |
| """ |
| Array function: Returns a merged array of structs in which the N-th struct contains all |
N-th values of input arrays. If one of the arrays is shorter than the others, then
the resulting struct type value will be `null` for the missing elements.
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or str |
| Columns of arrays to be merged. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| Merged array of entries. |
| |
| Examples |
| -------- |
| Example 1: Zipping two arrays of the same length |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, 3], ['a', 'b', 'c'])], ['nums', 'letters']) |
| >>> df.select(sf.arrays_zip(df.nums, df.letters)).show(truncate=False) |
| +-------------------------+ |
| |arrays_zip(nums, letters)| |
| +-------------------------+ |
| |[{1, a}, {2, b}, {3, c}] | |
| +-------------------------+ |
| |
| Example 2: Zipping arrays of different lengths |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2], ['a', 'b', 'c'])], ['nums', 'letters']) |
| >>> df.select(sf.arrays_zip(df.nums, df.letters)).show(truncate=False) |
| +---------------------------+ |
| |arrays_zip(nums, letters) | |
| +---------------------------+ |
| |[{1, a}, {2, b}, {NULL, c}]| |
| +---------------------------+ |
| |
| Example 3: Zipping more than two arrays |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame( |
| ... [([1, 2], ['a', 'b'], [True, False])], ['nums', 'letters', 'bools']) |
| >>> df.select(sf.arrays_zip(df.nums, df.letters, df.bools)).show(truncate=False) |
| +--------------------------------+ |
| |arrays_zip(nums, letters, bools)| |
| +--------------------------------+ |
| |[{1, a, true}, {2, b, false}] | |
| +--------------------------------+ |
| |
| Example 4: Zipping arrays with null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([([1, 2, None], ['a', None, 'c'])], ['nums', 'letters']) |
| >>> df.select(sf.arrays_zip(df.nums, df.letters)).show(truncate=False) |
| +------------------------------+ |
| |arrays_zip(nums, letters) | |
| +------------------------------+ |
| |[{1, a}, {2, NULL}, {NULL, c}]| |
| +------------------------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("arrays_zip", cols) |
| |
| |
| @overload |
| def map_concat(*cols: "ColumnOrName") -> Column: |
| ... |
| |
| |
| @overload |
| def map_concat(__cols: Union[Sequence["ColumnOrName"], Tuple["ColumnOrName", ...]]) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def map_concat( |
| *cols: Union["ColumnOrName", Union[Sequence["ColumnOrName"], Tuple["ColumnOrName", ...]]] |
| ) -> Column: |
| """ |
| Map function: Returns the union of all given maps. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or str |
| Column names or :class:`~pyspark.sql.Column` |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A map of merged entries from other maps. |
| |
| Notes |
| ----- |
| For duplicate keys in input maps, the handling is governed by `spark.sql.mapKeyDedupPolicy`. |
| By default, it throws an exception. If set to `LAST_WIN`, it uses the last map's value. |
| |
| Examples |
| -------- |
| Example 1: Basic usage of map_concat |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2") |
| >>> df.select(sf.map_concat("map1", "map2")).show(truncate=False) |
| +------------------------+ |
| |map_concat(map1, map2) | |
| +------------------------+ |
| |{1 -> a, 2 -> b, 3 -> c}| |
| +------------------------+ |
| |
| Example 2: map_concat with overlapping keys |
| |
| >>> from pyspark.sql import functions as sf |
| >>> originalmapKeyDedupPolicy = spark.conf.get("spark.sql.mapKeyDedupPolicy") |
| >>> spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN") |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(2, 'c', 3, 'd') as map2") |
| >>> df.select(sf.map_concat("map1", "map2")).show(truncate=False) |
| +------------------------+ |
| |map_concat(map1, map2) | |
| +------------------------+ |
| |{1 -> a, 2 -> c, 3 -> d}| |
| +------------------------+ |
| >>> spark.conf.set("spark.sql.mapKeyDedupPolicy", originalmapKeyDedupPolicy) |
| |
| Example 3: map_concat with three maps |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a') as map1, map(2, 'b') as map2, map(3, 'c') as map3") |
| >>> df.select(sf.map_concat("map1", "map2", "map3")).show(truncate=False) |
| +----------------------------+ |
| |map_concat(map1, map2, map3)| |
| +----------------------------+ |
| |{1 -> a, 2 -> b, 3 -> c} | |
| +----------------------------+ |
| |
| Example 4: map_concat with empty map |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map() as map2") |
| >>> df.select(sf.map_concat("map1", "map2")).show(truncate=False) |
| +----------------------+ |
| |map_concat(map1, map2)| |
| +----------------------+ |
| |{1 -> a, 2 -> b} | |
| +----------------------+ |
| |
| Example 5: map_concat with null values |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, null) as map2") |
| >>> df.select(sf.map_concat("map1", "map2")).show(truncate=False) |
| +---------------------------+ |
| |map_concat(map1, map2) | |
| +---------------------------+ |
| |{1 -> a, 2 -> b, 3 -> NULL}| |
| +---------------------------+ |
| """ |
| if len(cols) == 1 and isinstance(cols[0], (list, set)): |
| cols = cols[0] # type: ignore[assignment] |
| return _invoke_function_over_seq_of_columns("map_concat", cols) # type: ignore[arg-type] |
| |
| |
| @_try_remote_functions |
| def sequence( |
| start: "ColumnOrName", stop: "ColumnOrName", step: Optional["ColumnOrName"] = None |
| ) -> Column: |
| """ |
| Array function: Generate a sequence of integers from `start` to `stop`, incrementing by `step`. |
| If `step` is not set, the function increments by 1 if `start` is less than or equal to `stop`, |
| otherwise it decrements by 1. |
| |
| .. versionadded:: 2.4.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| start : :class:`~pyspark.sql.Column` or str |
| The starting value (inclusive) of the sequence. |
| stop : :class:`~pyspark.sql.Column` or str |
| The last value (inclusive) of the sequence. |
| step : :class:`~pyspark.sql.Column` or str, optional |
| The value to add to the current element to get the next element in the sequence. |
| The default is 1 if `start` is less than or equal to `stop`, otherwise -1. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains an array of sequence values. |
| |
| Examples |
| -------- |
| Example 1: Generating a sequence with default step |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(-2, 2)], ['start', 'stop']) |
| >>> df.select(sf.sequence(df.start, df.stop)).show() |
| +---------------------+ |
| |sequence(start, stop)| |
| +---------------------+ |
| | [-2, -1, 0, 1, 2]| |
| +---------------------+ |
| |
| Example 2: Generating a sequence with a custom step |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(4, -4, -2)], ['start', 'stop', 'step']) |
| >>> df.select(sf.sequence(df.start, df.stop, df.step)).show() |
| +---------------------------+ |
| |sequence(start, stop, step)| |
| +---------------------------+ |
| | [4, 2, 0, -2, -4]| |
| +---------------------------+ |
| |
| Example 3: Generating a sequence with a negative step |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(5, 1, -1)], ['start', 'stop', 'step']) |
| >>> df.select(sf.sequence(df.start, df.stop, df.step)).show() |
| +---------------------------+ |
| |sequence(start, stop, step)| |
| +---------------------------+ |
| | [5, 4, 3, 2, 1]| |
| +---------------------------+ |
| """ |
| if step is None: |
| return _invoke_function_over_columns("sequence", start, stop) |
| else: |
| return _invoke_function_over_columns("sequence", start, stop, step) |
| |
| |
| @_try_remote_functions |
| def from_csv( |
| col: "ColumnOrName", |
| schema: Union[Column, str], |
| options: Optional[Mapping[str, str]] = None, |
| ) -> Column: |
| """ |
| CSV Function: Parses a column containing a CSV string into a row with the specified schema. |
| Returns `null` if the string cannot be parsed. |
| |
| .. versionadded:: 3.0.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| A column or column name in CSV format. |
| schema : :class:`~pyspark.sql.Column` or str |
| A column, or Python string literal with schema in DDL format, to use when parsing the CSV column. |
| options : dict, optional |
| Options to control parsing. Accepts the same options as the CSV datasource. |
| See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_ |
| for the version you use. |
| |
| .. # noqa |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column of parsed CSV values. |
| |
| Examples |
| -------- |
| Example 1: Parsing a simple CSV string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> data = [("1,2,3",)] |
| >>> df = spark.createDataFrame(data, ("value",)) |
| >>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT")).show() |
| +---------------+ |
| |from_csv(value)| |
| +---------------+ |
| | {1, 2, 3}| |
| +---------------+ |
| |
| Example 2: Using schema_of_csv to infer the schema |
| |
| >>> from pyspark.sql import functions as sf |
>>> data = [("1,2,3",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> value = data[0][0]
| >>> df.select(sf.from_csv(df.value, sf.schema_of_csv(value))).show() |
| +---------------+ |
| |from_csv(value)| |
| +---------------+ |
| | {1, 2, 3}| |
| +---------------+ |
| |
| Example 3: Ignoring leading white space in the CSV string |
| |
| >>> from pyspark.sql import functions as sf |
| >>> data = [(" abc",)] |
| >>> df = spark.createDataFrame(data, ("value",)) |
| >>> options = {'ignoreLeadingWhiteSpace': True} |
| >>> df.select(sf.from_csv(df.value, "s string", options)).show() |
| +---------------+ |
| |from_csv(value)| |
| +---------------+ |
| | {abc}| |
| +---------------+ |
| |
| Example 4: Parsing a CSV string with a missing value |
| |
| >>> from pyspark.sql import functions as sf |
| >>> data = [("1,2,",)] |
| >>> df = spark.createDataFrame(data, ("value",)) |
| >>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT")).show() |
| +---------------+ |
| |from_csv(value)| |
| +---------------+ |
| | {1, 2, NULL}| |
| +---------------+ |
| |
| Example 5: Parsing a CSV string with a different delimiter |
| |
| >>> from pyspark.sql import functions as sf |
| >>> data = [("1;2;3",)] |
| >>> df = spark.createDataFrame(data, ("value",)) |
| >>> options = {'delimiter': ';'} |
| >>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT", options)).show() |
| +---------------+ |
| |from_csv(value)| |
| +---------------+ |
| | {1, 2, 3}| |
| +---------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if not isinstance(schema, (str, Column)): |
| raise PySparkTypeError( |
| errorClass="NOT_COLUMN_OR_STR", |
| messageParameters={"arg_name": "schema", "arg_type": type(schema).__name__}, |
| ) |
| |
| return _invoke_function( |
| "from_csv", _to_java_column(col), _to_java_column(lit(schema)), _options_to_str(options) |
| ) |
| |
| |
| def _unresolved_named_lambda_variable(name: str) -> Column: |
| """ |
| Create `o.a.s.sql.expressions.UnresolvedNamedLambdaVariable`, |
convert it to o.a.s.sql.Column and wrap in a Python `Column`
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
name : str
| """ |
| from py4j.java_gateway import JVMView |
| |
| sc = _get_active_spark_context() |
| return Column(cast(JVMView, sc._jvm).PythonSQLUtils.unresolvedNamedLambdaVariable(name)) |
| |
| |
| def _get_lambda_parameters(f: Callable) -> ValuesView[inspect.Parameter]: |
| signature = inspect.signature(f) |
| parameters = signature.parameters.values() |
| |
# We should exclude functions that use
# variable args and keyword args,
# as well as keyword-only args
| supported_parameter_types = { |
| inspect.Parameter.POSITIONAL_OR_KEYWORD, |
| inspect.Parameter.POSITIONAL_ONLY, |
| } |
| |
| # Validate that |
| # function arity is between 1 and 3 |
| if not (1 <= len(parameters) <= 3): |
| raise PySparkValueError( |
| errorClass="WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION", |
| messageParameters={"func_name": f.__name__, "num_args": str(len(parameters))}, |
| ) |
| |
| # and all arguments can be used as positional |
| if not all(p.kind in supported_parameter_types for p in parameters): |
| raise PySparkValueError( |
| errorClass="UNSUPPORTED_PARAM_TYPE_FOR_HIGHER_ORDER_FUNCTION", |
| messageParameters={"func_name": f.__name__}, |
| ) |
| |
| return parameters |
| |
| |
| def _create_lambda(f: Callable) -> Callable: |
| """ |
| Create `o.a.s.sql.expressions.LambdaFunction` corresponding |
| to transformation described by f |
| |
:param f: A Python function of one of the following forms:
| - (Column) -> Column: ... |
| - (Column, Column) -> Column: ... |
| - (Column, Column, Column) -> Column: ... |
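
For example, ``_create_lambda(lambda x: x + 1)`` builds a JVM ``LambdaFunction``
equivalent to the SQL lambda ``x -> x + 1``.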
| """ |
| from py4j.java_gateway import JVMView |
| from pyspark.sql.classic.column import _to_seq |
| |
| parameters = _get_lambda_parameters(f) |
| |
| sc = _get_active_spark_context() |
| |
| argnames = ["x", "y", "z"] |
| args = [_unresolved_named_lambda_variable(arg) for arg in argnames[: len(parameters)]] |
| |
| result = f(*args) |
| |
| if not isinstance(result, Column): |
| raise PySparkValueError( |
| errorClass="HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN", |
| messageParameters={"func_name": f.__name__, "return_type": type(result).__name__}, |
| ) |
| |
| jexpr = result._jc |
| jargs = _to_seq(sc, [arg._jc for arg in args]) |
| return cast(JVMView, sc._jvm).PythonSQLUtils.lambdaFunction(jexpr, jargs) |
| |
| |
| def _invoke_higher_order_function( |
| name: str, |
| cols: Sequence["ColumnOrName"], |
| funs: Sequence[Callable], |
| ) -> Column: |
| """ |
Invokes expression identified by name
(relative to ``org.apache.spark.sql.catalyst.expressions``)
| and wraps the result with Column (first Scala one, then Python). |
| |
| :param name: Name of the expression |
| :param cols: a list of columns |
| :param funs: a list of (*Column) -> Column functions. |
| |
| :return: a Column |
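
For example, :func:`transform` below calls
``_invoke_higher_order_function("transform", [col], [f])`` to build the SQL
expression ``transform(col, x -> f(x))``.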
| """ |
| from py4j.java_gateway import JVMView |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| sc = _get_active_spark_context() |
| jfuns = [_create_lambda(f) for f in funs] |
| jcols = [_to_java_column(c) for c in cols] |
| return Column(cast(JVMView, sc._jvm).PythonSQLUtils.fn(name, _to_seq(sc, jcols + jfuns))) |
| |
| |
| @overload |
| def transform(col: "ColumnOrName", f: Callable[[Column], Column]) -> Column: |
| ... |
| |
| |
| @overload |
| def transform(col: "ColumnOrName", f: Callable[[Column, Column], Column]) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def transform( |
| col: "ColumnOrName", |
| f: Union[Callable[[Column], Column], Callable[[Column, Column], Column]], |
| ) -> Column: |
| """ |
| Returns an array of elements after applying a transformation to each element in the input array. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| f : function |
| a function that is applied to each element of the input array. |
| Can take one of the following forms: |
| |
| - Unary ``(x: Column) -> Column: ...`` |
- Binary ``(x: Column, i: Column) -> Column: ...``, where the second argument is
| a 0-based index of the element. |
| |
| and can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| a new array of transformed elements. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(1, [1, 2, 3, 4])], ("key", "values")) |
| >>> df.select(transform("values", lambda x: x * 2).alias("doubled")).show() |
| +------------+ |
| | doubled| |
| +------------+ |
| |[2, 4, 6, 8]| |
| +------------+ |
| |
| >>> def alternate(x, i): |
| ... return when(i % 2 == 0, x).otherwise(-x) |
| ... |
| >>> df.select(transform("values", alternate).alias("alternated")).show() |
| +--------------+ |
| | alternated| |
| +--------------+ |
| |[1, -2, 3, -4]| |
| +--------------+ |
| """ |
| return _invoke_higher_order_function("transform", [col], [f]) |
| |
| |
| @_try_remote_functions |
| def exists(col: "ColumnOrName", f: Callable[[Column], Column]) -> Column: |
| """ |
| Returns whether a predicate holds for one or more elements in the array. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| f : function |
| ``(x: Column) -> Column: ...`` returning the Boolean expression. |
| Can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| True if "any" element of an array evaluates to True when passed as an argument to |
| given function and False otherwise. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(1, [1, 2, 3, 4]), (2, [3, -1, 0])],("key", "values")) |
| >>> df.select(exists("values", lambda x: x < 0).alias("any_negative")).show() |
| +------------+ |
| |any_negative| |
| +------------+ |
| | false| |
| | true| |
| +------------+ |
| """ |
| return _invoke_higher_order_function("exists", [col], [f]) |
| |
| |
| @_try_remote_functions |
| def forall(col: "ColumnOrName", f: Callable[[Column], Column]) -> Column: |
| """ |
| Returns whether a predicate holds for every element in the array. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| f : function |
| ``(x: Column) -> Column: ...`` returning the Boolean expression. |
| Can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| True if "all" elements of an array evaluates to True when passed as an argument to |
| given function and False otherwise. |
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame( |
| ... [(1, ["bar"]), (2, ["foo", "bar"]), (3, ["foobar", "foo"])], |
| ... ("key", "values") |
| ... ) |
| >>> df.select(forall("values", lambda x: x.rlike("foo")).alias("all_foo")).show() |
| +-------+ |
| |all_foo| |
| +-------+ |
| | false| |
| | false| |
| | true| |
| +-------+ |
| """ |
| return _invoke_higher_order_function("forall", [col], [f]) |
| |
| |
| @overload |
| def filter(col: "ColumnOrName", f: Callable[[Column], Column]) -> Column: |
| ... |
| |
| |
| @overload |
| def filter(col: "ColumnOrName", f: Callable[[Column, Column], Column]) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def filter( |
| col: "ColumnOrName", |
| f: Union[Callable[[Column], Column], Callable[[Column, Column], Column]], |
| ) -> Column: |
| """ |
| Returns an array of elements for which a predicate holds in a given array. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| f : function |
        A function that returns a Boolean expression.
        Can take one of the following forms:

        - Unary ``(x: Column) -> Column: ...``
        - Binary ``(x: Column, i: Column) -> Column: ...``, where the second argument is
          the 0-based index of the element.
| |
| and can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        filtered array of elements for which the given function evaluated to True.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame( |
| ... [(1, ["2018-09-20", "2019-02-03", "2019-07-01", "2020-06-01"])], |
| ... ("key", "values") |
| ... ) |
| >>> def after_second_quarter(x): |
| ... return month(to_date(x)) > 6 |
| ... |
| >>> df.select( |
| ... filter("values", after_second_quarter).alias("after_second_quarter") |
| ... ).show(truncate=False) |
| +------------------------+ |
| |after_second_quarter | |
| +------------------------+ |
| |[2018-09-20, 2019-07-01]| |
| +------------------------+ |
| """ |
| return _invoke_higher_order_function("filter", [col], [f]) |
| |
| |
| @_try_remote_functions |
| def aggregate( |
| col: "ColumnOrName", |
| initialValue: "ColumnOrName", |
| merge: Callable[[Column, Column], Column], |
| finish: Optional[Callable[[Column], Column]] = None, |
| ) -> Column: |
| """ |
| Applies a binary operator to an initial state and all elements in the array, |
| and reduces this to a single state. The final state is converted into the final result |
| by applying a finish function. |
| |
| Both functions can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| initialValue : :class:`~pyspark.sql.Column` or str |
        the initial value; a column name or expression
    merge : function
        a binary function ``(acc: Column, x: Column) -> Column: ...`` returning an
        expression of the same type as ``initialValue``
    finish : function, optional
        an optional unary function ``(x: Column) -> Column: ...``
        used to convert the accumulated value.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        the final value after the aggregate function is applied.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values")) |
| >>> df.select(aggregate("values", lit(0.0), lambda acc, x: acc + x).alias("sum")).show() |
| +----+ |
| | sum| |
| +----+ |
| |42.0| |
| +----+ |
| |
| >>> def merge(acc, x): |
| ... count = acc.count + 1 |
| ... sum = acc.sum + x |
| ... return struct(count.alias("count"), sum.alias("sum")) |
| ... |
| >>> df.select( |
| ... aggregate( |
| ... "values", |
| ... struct(lit(0).alias("count"), lit(0.0).alias("sum")), |
| ... merge, |
| ... lambda acc: acc.sum / acc.count, |
| ... ).alias("mean") |
| ... ).show() |
| +----+ |
| |mean| |
| +----+ |
| | 8.4| |
| +----+ |
| """ |
| if finish is not None: |
| return _invoke_higher_order_function("aggregate", [col, initialValue], [merge, finish]) |
| |
| else: |
| return _invoke_higher_order_function("aggregate", [col, initialValue], [merge]) |
| |
| |
| @_try_remote_functions |
| def reduce( |
| col: "ColumnOrName", |
| initialValue: "ColumnOrName", |
| merge: Callable[[Column, Column], Column], |
| finish: Optional[Callable[[Column], Column]] = None, |
| ) -> Column: |
| """ |
| Applies a binary operator to an initial state and all elements in the array, |
| and reduces this to a single state. The final state is converted into the final result |
| by applying a finish function. |
| |
| Both functions can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| initialValue : :class:`~pyspark.sql.Column` or str |
        the initial value; a column name or expression
    merge : function
        a binary function ``(acc: Column, x: Column) -> Column: ...`` returning an
        expression of the same type as ``initialValue``
    finish : function, optional
        an optional unary function ``(x: Column) -> Column: ...``
        used to convert the accumulated value.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        the final value after the aggregate function is applied.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values")) |
| >>> df.select(reduce("values", lit(0.0), lambda acc, x: acc + x).alias("sum")).show() |
| +----+ |
| | sum| |
| +----+ |
| |42.0| |
| +----+ |
| |
| >>> def merge(acc, x): |
| ... count = acc.count + 1 |
| ... sum = acc.sum + x |
| ... return struct(count.alias("count"), sum.alias("sum")) |
| ... |
| >>> df.select( |
| ... reduce( |
| ... "values", |
| ... struct(lit(0).alias("count"), lit(0.0).alias("sum")), |
| ... merge, |
| ... lambda acc: acc.sum / acc.count, |
| ... ).alias("mean") |
| ... ).show() |
| +----+ |
| |mean| |
| +----+ |
| | 8.4| |
| +----+ |
| """ |
| if finish is not None: |
| return _invoke_higher_order_function("reduce", [col, initialValue], [merge, finish]) |
| |
| else: |
| return _invoke_higher_order_function("reduce", [col, initialValue], [merge]) |
| |
| |
| @_try_remote_functions |
| def zip_with( |
| left: "ColumnOrName", |
| right: "ColumnOrName", |
| f: Callable[[Column, Column], Column], |
| ) -> Column: |
| """ |
| Merge two given arrays, element-wise, into a single array using a function. |
| If one array is shorter, nulls are appended at the end to match the length of the longer |
| array, before applying the function. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| left : :class:`~pyspark.sql.Column` or str |
| name of the first column or expression |
| right : :class:`~pyspark.sql.Column` or str |
| name of the second column or expression |
| f : function |
        a binary function ``(x1: Column, x2: Column) -> Column: ...``
| Can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        array of calculated values derived by applying the given function to each pair
        of elements.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(1, [1, 3, 5, 8], [0, 2, 4, 6])], ("id", "xs", "ys")) |
| >>> df.select(zip_with("xs", "ys", lambda x, y: x ** y).alias("powers")).show(truncate=False) |
| +---------------------------+ |
| |powers | |
| +---------------------------+ |
| |[1.0, 9.0, 625.0, 262144.0]| |
| +---------------------------+ |
| |
| >>> df = spark.createDataFrame([(1, ["foo", "bar"], [1, 2, 3])], ("id", "xs", "ys")) |
| >>> df.select(zip_with("xs", "ys", lambda x, y: concat_ws("_", x, y)).alias("xs_ys")).show() |
| +-----------------+ |
| | xs_ys| |
| +-----------------+ |
| |[foo_1, bar_2, 3]| |
| +-----------------+ |
| """ |
| return _invoke_higher_order_function("zip_with", [left, right], [f]) |
| |
| |
| @_try_remote_functions |
| def transform_keys(col: "ColumnOrName", f: Callable[[Column, Column], Column]) -> Column: |
| """ |
| Applies a function to every key-value pair in a map and returns |
| a map with the results of those applications as the new keys for the pairs. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| f : function |
        a binary function ``(k: Column, v: Column) -> Column: ...``
| Can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        a new map of entries where the new keys were calculated by applying the given
        function to each key-value pair.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(1, {"foo": -2.0, "bar": 2.0})], ("id", "data")) |
| >>> row = df.select(transform_keys( |
| ... "data", lambda k, _: upper(k)).alias("data_upper") |
| ... ).head() |
| >>> sorted(row["data_upper"].items()) |
| [('BAR', 2.0), ('FOO', -2.0)] |
| """ |
| return _invoke_higher_order_function("transform_keys", [col], [f]) |
| |
| |
| @_try_remote_functions |
| def transform_values(col: "ColumnOrName", f: Callable[[Column, Column], Column]) -> Column: |
| """ |
| Applies a function to every key-value pair in a map and returns |
| a map with the results of those applications as the new values for the pairs. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| name of column or expression |
| f : function |
        a binary function ``(k: Column, v: Column) -> Column: ...``
| Can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        a new map of entries where the new values were calculated by applying the given
        function to each key-value pair.
| |
| Examples |
| -------- |
| >>> df = spark.createDataFrame([(1, {"IT": 10.0, "SALES": 2.0, "OPS": 24.0})], ("id", "data")) |
| >>> row = df.select(transform_values( |
| ... "data", lambda k, v: when(k.isin("IT", "OPS"), v + 10.0).otherwise(v) |
| ... ).alias("new_data")).head() |
| >>> sorted(row["new_data"].items()) |
| [('IT', 20.0), ('OPS', 34.0), ('SALES', 2.0)] |
| """ |
| return _invoke_higher_order_function("transform_values", [col], [f]) |
| |
| |
| @_try_remote_functions |
| def map_filter(col: "ColumnOrName", f: Callable[[Column, Column], Column]) -> Column: |
| """ |
| Collection function: Returns a new map column whose key-value pairs satisfy a given |
| predicate function. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| The name of the column or a column expression representing the map to be filtered. |
| f : function |
        A binary function ``(k: Column, v: Column) -> Column: ...`` that defines the predicate.
| This function should return a boolean column that will be used to filter the input map. |
| Can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new map column containing only the key-value pairs that satisfy the predicate. |
| |
| Examples |
| -------- |
| Example 1: Filtering a map with a simple condition |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, {"foo": 42.0, "bar": 1.0, "baz": 32.0})], ("id", "data")) |
| >>> row = df.select( |
| ... sf.map_filter("data", lambda _, v: v > 30.0).alias("data_filtered") |
| ... ).head() |
| >>> sorted(row["data_filtered"].items()) |
| [('baz', 32.0), ('foo', 42.0)] |
| |
| Example 2: Filtering a map with a condition on keys |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, {"foo": 42.0, "bar": 1.0, "baz": 32.0})], ("id", "data")) |
| >>> row = df.select( |
| ... sf.map_filter("data", lambda k, _: k.startswith("b")).alias("data_filtered") |
| ... ).head() |
| >>> sorted(row["data_filtered"].items()) |
| [('bar', 1.0), ('baz', 32.0)] |
| |
| Example 3: Filtering a map with a complex condition |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, {"foo": 42.0, "bar": 1.0, "baz": 32.0})], ("id", "data")) |
| >>> row = df.select( |
| ... sf.map_filter("data", lambda k, v: k.startswith("b") & (v > 1.0)).alias("data_filtered") |
| ... ).head() |
| >>> sorted(row["data_filtered"].items()) |
| [('baz', 32.0)] |
| """ |
| return _invoke_higher_order_function("map_filter", [col], [f]) |
| |
| |
| @_try_remote_functions |
| def map_zip_with( |
| col1: "ColumnOrName", |
| col2: "ColumnOrName", |
| f: Callable[[Column, Column, Column], Column], |
| ) -> Column: |
| """ |
    Collection function: Merges two given maps into a single map by applying a function
    to the key-value pairs.
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or str |
| The name of the first column or a column expression representing the first map. |
| col2 : :class:`~pyspark.sql.Column` or str |
| The name of the second column or a column expression representing the second map. |
| f : function |
        A ternary function ``(k: Column, v1: Column, v2: Column) -> Column: ...`` that defines
| how to merge the values from the two maps. This function should return a column that |
| will be used as the value in the resulting map. |
| Can use methods of :class:`~pyspark.sql.Column`, functions defined in |
| :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``. |
| Python ``UserDefinedFunctions`` are not supported |
| (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new map column where each key-value pair is the result of applying the function to |
| the corresponding key-value pairs in the input maps. |
| |
| Examples |
| -------- |
| Example 1: Merging two maps with a simple function |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (1, {"A": 1, "B": 2}, {"A": 3, "B": 4})], |
| ... ("id", "map1", "map2")) |
| >>> row = df.select( |
| ... sf.map_zip_with("map1", "map2", lambda _, v1, v2: v1 + v2).alias("updated_data") |
| ... ).head() |
| >>> sorted(row["updated_data"].items()) |
| [('A', 4), ('B', 6)] |
| |
| Example 2: Merging two maps with a complex function |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (1, {"A": 1, "B": 2}, {"A": 3, "B": 4})], |
| ... ("id", "map1", "map2")) |
| >>> row = df.select( |
| ... sf.map_zip_with("map1", "map2", |
| ... lambda k, v1, v2: sf.when(k == "A", v1 + v2).otherwise(v1 - v2) |
| ... ).alias("updated_data") |
| ... ).head() |
| >>> sorted(row["updated_data"].items()) |
| [('A', 4), ('B', -2)] |
| |
| Example 3: Merging two maps with mismatched keys |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([ |
| ... (1, {"A": 1, "B": 2}, {"B": 3, "C": 4})], |
| ... ("id", "map1", "map2")) |
| >>> row = df.select( |
| ... sf.map_zip_with("map1", "map2", |
| ... lambda _, v1, v2: sf.when(v2.isNull(), v1).otherwise(v1 + v2) |
| ... ).alias("updated_data") |
| ... ).head() |
| >>> sorted(row["updated_data"].items()) |
| [('A', 1), ('B', 5), ('C', None)] |
| """ |
| return _invoke_higher_order_function("map_zip_with", [col1, col2], [f]) |
| |
| |
| @_try_remote_functions |
| def str_to_map( |
| text: "ColumnOrName", |
| pairDelim: Optional["ColumnOrName"] = None, |
| keyValueDelim: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Map function: Converts a string into a map after splitting the text into key/value pairs |
| using delimiters. Both `pairDelim` and `keyValueDelim` are treated as regular expressions. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| text : :class:`~pyspark.sql.Column` or str |
| Input column or strings. |
| pairDelim : :class:`~pyspark.sql.Column` or str, optional |
| Delimiter to use to split pairs. Default is comma (,). |
| keyValueDelim : :class:`~pyspark.sql.Column` or str, optional |
| Delimiter to use to split key/value. Default is colon (:). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column of map type where each string in the original column is converted into a map. |
| |
| Examples |
| -------- |
| Example 1: Using default delimiters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"]) |
| >>> df.select(sf.str_to_map(df.e)).show(truncate=False) |
| +------------------------+ |
| |str_to_map(e, ,, :) | |
| +------------------------+ |
| |{a -> 1, b -> 2, c -> 3}| |
| +------------------------+ |
| |
| Example 2: Using custom delimiters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("a=1;b=2;c=3",)], ["e"]) |
| >>> df.select(sf.str_to_map(df.e, sf.lit(";"), sf.lit("="))).show(truncate=False) |
| +------------------------+ |
| |str_to_map(e, ;, =) | |
| +------------------------+ |
| |{a -> 1, b -> 2, c -> 3}| |
| +------------------------+ |
| |
| Example 3: Using different delimiters for different rows |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("a:1,b:2,c:3",), ("d=4;e=5;f=6",)], ["e"]) |
| >>> df.select(sf.str_to_map(df.e, |
| ... sf.when(df.e.contains(";"), sf.lit(";")).otherwise(sf.lit(",")), |
| ... sf.when(df.e.contains("="), sf.lit("=")).otherwise(sf.lit(":"))).alias("str_to_map") |
| ... ).show(truncate=False) |
| +------------------------+ |
| |str_to_map | |
| +------------------------+ |
| |{a -> 1, b -> 2, c -> 3}| |
| |{d -> 4, e -> 5, f -> 6}| |
| +------------------------+ |
| |
| Example 4: Using a column of delimiters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("a:1,b:2,c:3", ","), ("d=4;e=5;f=6", ";")], ["e", "delim"]) |
| >>> df.select(sf.str_to_map(df.e, df.delim, sf.lit(":"))).show(truncate=False) |
| +---------------------------------------+ |
| |str_to_map(e, delim, :) | |
| +---------------------------------------+ |
| |{a -> 1, b -> 2, c -> 3} | |
| |{d=4 -> NULL, e=5 -> NULL, f=6 -> NULL}| |
| +---------------------------------------+ |
| |
| Example 5: Using a column of key/value delimiters |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("a:1,b:2,c:3", ":"), ("d=4;e=5;f=6", "=")], ["e", "delim"]) |
| >>> df.select(sf.str_to_map(df.e, sf.lit(","), df.delim)).show(truncate=False) |
| +------------------------+ |
| |str_to_map(e, ,, delim) | |
| +------------------------+ |
| |{a -> 1, b -> 2, c -> 3}| |
| |{d -> 4;e=5;f=6} | |
| +------------------------+ |
| """ |
| if pairDelim is None: |
| pairDelim = lit(",") |
| if keyValueDelim is None: |
| keyValueDelim = lit(":") |
| return _invoke_function_over_columns("str_to_map", text, pairDelim, keyValueDelim) |
| |
| |
| # ---------------------- Partition transform functions -------------------------------- |
| |
| |
| @_try_remote_functions |
| def years(col: "ColumnOrName") -> Column: |
| """ |
| Partition transform function: A transform for timestamps and dates |
| to partition data into years. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 4.0.0 |
| Use :func:`partitioning.years` instead. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| target date or timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| data partitioned by years. |
| |
| Examples |
| -------- |
| >>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP |
| ... years("ts") |
| ... ).createOrReplace() |
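
    The replacement in the ``partitioning`` namespace is invoked the same way;
    a sketch of the migration:

    >>> from pyspark.sql.functions import partitioning
    >>> df.writeTo("catalog.db.table").partitionedBy(  # doctest: +SKIP
    ...     partitioning.years("ts")
    ... ).createOrReplace()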
| |
| Notes |
| ----- |
| This function can be used only in combination with |
| :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` |
| method of the `DataFrameWriterV2`. |
| |
| """ |
| from pyspark.sql.functions import partitioning |
| |
| warnings.warn("Deprecated in 4.0.0, use partitioning.years instead.", FutureWarning) |
| |
| return partitioning.years(col) |
| |
| |
| @_try_remote_functions |
| def months(col: "ColumnOrName") -> Column: |
| """ |
| Partition transform function: A transform for timestamps and dates |
| to partition data into months. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 4.0.0 |
| Use :func:`partitioning.months` instead. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| target date or timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| data partitioned by months. |
| |
| Examples |
| -------- |
| >>> df.writeTo("catalog.db.table").partitionedBy( |
| ... months("ts") |
| ... ).createOrReplace() # doctest: +SKIP |
| |
| Notes |
| ----- |
| This function can be used only in combination with |
| :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` |
| method of the `DataFrameWriterV2`. |
| |
| """ |
| from pyspark.sql.functions import partitioning |
| |
| warnings.warn("Deprecated in 4.0.0, use partitioning.months instead.", FutureWarning) |
| |
| return partitioning.months(col) |
| |
| |
| @_try_remote_functions |
| def days(col: "ColumnOrName") -> Column: |
| """ |
| Partition transform function: A transform for timestamps and dates |
| to partition data into days. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 4.0.0 |
        Use :func:`partitioning.days` instead.
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| target date or timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| data partitioned by days. |
| |
| Examples |
| -------- |
| >>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP |
| ... days("ts") |
| ... ).createOrReplace() |
| |
| Notes |
| ----- |
| This function can be used only in combination with |
| :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` |
| method of the `DataFrameWriterV2`. |
| |
| """ |
| from pyspark.sql.functions import partitioning |
| |
| warnings.warn("Deprecated in 4.0.0, use partitioning.days instead.", FutureWarning) |
| |
| return partitioning.days(col) |
| |
| |
| @_try_remote_functions |
| def hours(col: "ColumnOrName") -> Column: |
| """ |
| Partition transform function: A transform for timestamps |
| to partition data into hours. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 4.0.0 |
| Use :func:`partitioning.hours` instead. |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or str |
| target date or timestamp column to work on. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| data partitioned by hours. |
| |
| Examples |
| -------- |
| >>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP |
| ... hours("ts") |
| ... ).createOrReplace() |
| |
| Notes |
| ----- |
| This function can be used only in combination with |
| :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` |
| method of the `DataFrameWriterV2`. |
| |
| """ |
| from pyspark.sql.functions import partitioning |
| |
| warnings.warn("Deprecated in 4.0.0, use partitioning.hours instead.", FutureWarning) |
| |
| return partitioning.hours(col) |
| |
| |
| @_try_remote_functions |
| def convert_timezone( |
| sourceTz: Optional[Column], targetTz: Column, sourceTs: "ColumnOrName" |
| ) -> Column: |
| """ |
| Converts the timestamp without time zone `sourceTs` |
| from the `sourceTz` time zone to `targetTz`. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| sourceTz : :class:`~pyspark.sql.Column`, optional |
        The time zone for the input timestamp. If omitted, the current session
        time zone is used as the source time zone.
| targetTz : :class:`~pyspark.sql.Column` |
| The time zone to which the input timestamp should be converted. |
| sourceTs : :class:`~pyspark.sql.Column` or column name |
| A timestamp without time zone. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
        A new column containing the timestamp converted to the target time zone.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.current_timezone` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| Example 1: Converts the timestamp without time zone `sourceTs`. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 00:00:00',)], ['ts']) |
| >>> df.select( |
| ... '*', |
| ... sf.convert_timezone(None, sf.lit('Asia/Hong_Kong'), 'ts') |
| ... ).show() # doctest: +SKIP |
| +-------------------+--------------------------------------------------------+ |
| | ts|convert_timezone(current_timezone(), Asia/Hong_Kong, ts)| |
| +-------------------+--------------------------------------------------------+ |
| |2015-04-08 00:00:00| 2015-04-08 15:00:00| |
| +-------------------+--------------------------------------------------------+ |
| |
| Example 2: Converts the timestamp with time zone `sourceTs`. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('2015-04-08 15:00:00',)], ['ts']) |
| >>> df.select( |
| ... '*', |
| ... sf.convert_timezone(sf.lit('Asia/Hong_Kong'), sf.lit('America/Los_Angeles'), df.ts) |
| ... ).show() |
| +-------------------+---------------------------------------------------------+ |
| | ts|convert_timezone(Asia/Hong_Kong, America/Los_Angeles, ts)| |
| +-------------------+---------------------------------------------------------+ |
| |2015-04-08 15:00:00| 2015-04-08 00:00:00| |
| +-------------------+---------------------------------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| if sourceTz is None: |
| return _invoke_function_over_columns("convert_timezone", targetTz, sourceTs) |
| else: |
| return _invoke_function_over_columns("convert_timezone", sourceTz, targetTz, sourceTs) |
| |
| |
| @_try_remote_functions |
| def make_dt_interval( |
| days: Optional["ColumnOrName"] = None, |
| hours: Optional["ColumnOrName"] = None, |
| mins: Optional["ColumnOrName"] = None, |
| secs: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Make DayTimeIntervalType duration from days, hours, mins and secs. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| days : :class:`~pyspark.sql.Column` or column name, optional |
| The number of days, positive or negative. |
| hours : :class:`~pyspark.sql.Column` or column name, optional |
| The number of hours, positive or negative. |
| mins : :class:`~pyspark.sql.Column` or column name, optional |
| The number of minutes, positive or negative. |
| secs : :class:`~pyspark.sql.Column` or column name, optional |
| The number of seconds with the fractional part in microsecond precision. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a DayTimeIntervalType duration. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.make_ym_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| Example 1: Make DayTimeIntervalType duration from days, hours, mins and secs. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], ['day', 'hour', 'min', 'sec']) |
| >>> df.select('*', sf.make_dt_interval(df.day, df.hour, df.min, df.sec)).show(truncate=False) |
| +---+----+---+--------+------------------------------------------+ |
| |day|hour|min|sec |make_dt_interval(day, hour, min, sec) | |
| +---+----+---+--------+------------------------------------------+ |
| |1 |12 |30 |1.001001|INTERVAL '1 12:30:01.001001' DAY TO SECOND| |
| +---+----+---+--------+------------------------------------------+ |
| |
| Example 2: Make DayTimeIntervalType duration from days, hours and mins. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], ['day', 'hour', 'min', 'sec']) |
| >>> df.select('*', sf.make_dt_interval(df.day, 'hour', df.min)).show(truncate=False) |
| +---+----+---+--------+-----------------------------------+ |
| |day|hour|min|sec |make_dt_interval(day, hour, min, 0)| |
| +---+----+---+--------+-----------------------------------+ |
| |1 |12 |30 |1.001001|INTERVAL '1 12:30:00' DAY TO SECOND| |
| +---+----+---+--------+-----------------------------------+ |
| |
| Example 3: Make DayTimeIntervalType duration from days and hours. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], ['day', 'hour', 'min', 'sec']) |
| >>> df.select('*', sf.make_dt_interval(df.day, df.hour)).show(truncate=False) |
| +---+----+---+--------+-----------------------------------+ |
| |day|hour|min|sec |make_dt_interval(day, hour, 0, 0) | |
| +---+----+---+--------+-----------------------------------+ |
| |1 |12 |30 |1.001001|INTERVAL '1 12:00:00' DAY TO SECOND| |
| +---+----+---+--------+-----------------------------------+ |
| |
| Example 4: Make DayTimeIntervalType duration from days. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]], ['day', 'hour', 'min', 'sec']) |
| >>> df.select('*', sf.make_dt_interval('day')).show(truncate=False) |
| +---+----+---+--------+-----------------------------------+ |
| |day|hour|min|sec |make_dt_interval(day, 0, 0, 0) | |
| +---+----+---+--------+-----------------------------------+ |
| |1 |12 |30 |1.001001|INTERVAL '1 00:00:00' DAY TO SECOND| |
| +---+----+---+--------+-----------------------------------+ |
| |
| Example 5: Make empty interval. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.make_dt_interval()).show(truncate=False) |
| +-----------------------------------+ |
| |make_dt_interval(0, 0, 0, 0) | |
| +-----------------------------------+ |
| |INTERVAL '0 00:00:00' DAY TO SECOND| |
| +-----------------------------------+ |
| """ |
| _days = lit(0) if days is None else days |
| _hours = lit(0) if hours is None else hours |
| _mins = lit(0) if mins is None else mins |
| _secs = lit(decimal.Decimal(0)) if secs is None else secs |
| return _invoke_function_over_columns("make_dt_interval", _days, _hours, _mins, _secs) |
| |
| |
| @_try_remote_functions |
| def try_make_interval( |
| years: Optional["ColumnOrName"] = None, |
| months: Optional["ColumnOrName"] = None, |
| weeks: Optional["ColumnOrName"] = None, |
| days: Optional["ColumnOrName"] = None, |
| hours: Optional["ColumnOrName"] = None, |
| mins: Optional["ColumnOrName"] = None, |
| secs: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
    This is a special version of `make_interval` that performs the same operation, but
    returns NULL instead of raising an error if the interval cannot be created.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or column name, optional |
| The number of years, positive or negative. |
| months : :class:`~pyspark.sql.Column` or column name, optional |
| The number of months, positive or negative. |
| weeks : :class:`~pyspark.sql.Column` or column name, optional |
| The number of weeks, positive or negative. |
| days : :class:`~pyspark.sql.Column` or column name, optional |
| The number of days, positive or negative. |
| hours : :class:`~pyspark.sql.Column` or column name, optional |
| The number of hours, positive or negative. |
| mins : :class:`~pyspark.sql.Column` or column name, optional |
| The number of minutes, positive or negative. |
| secs : :class:`~pyspark.sql.Column` or column name, optional |
| The number of seconds with the fractional part in microsecond precision. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains an interval. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.make_dt_interval` |
| :meth:`pyspark.sql.functions.make_ym_interval` |
| |
| Examples |
| -------- |
| Example 1: Try make interval from years, months, weeks, days, hours, mins and secs. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.try_make_interval(df.year, df.month, 'week', df.day, 'hour', df.min, df.sec) |
| ... ).show(truncate=False) |
| +---------------------------------------------------------------+ |
| |try_make_interval(year, month, week, day, hour, min, sec) | |
| +---------------------------------------------------------------+ |
| |100 years 11 months 8 days 12 hours 30 minutes 1.001001 seconds| |
| +---------------------------------------------------------------+ |
| |
| Example 2: Try make interval from years, months, weeks, days, hours and mins. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.try_make_interval(df.year, df.month, 'week', df.day, df.hour, df.min) |
| ... ).show(truncate=False) |
| +-------------------------------------------------------+ |
| |try_make_interval(year, month, week, day, hour, min, 0)| |
| +-------------------------------------------------------+ |
| |100 years 11 months 8 days 12 hours 30 minutes | |
| +-------------------------------------------------------+ |
| |
| Example 3: Try make interval from years, months, weeks, days and hours. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.try_make_interval(df.year, df.month, 'week', df.day, df.hour) |
| ... ).show(truncate=False) |
| +-----------------------------------------------------+ |
| |try_make_interval(year, month, week, day, hour, 0, 0)| |
| +-----------------------------------------------------+ |
| |100 years 11 months 8 days 12 hours | |
| +-----------------------------------------------------+ |
| |
| Example 4: Try make interval from years, months, weeks and days. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select(sf.try_make_interval(df.year, 'month', df.week, df.day)).show(truncate=False) |
| +--------------------------------------------------+ |
| |try_make_interval(year, month, week, day, 0, 0, 0)| |
| +--------------------------------------------------+ |
| |100 years 11 months 8 days | |
| +--------------------------------------------------+ |
| |
| Example 5: Try make interval from years, months and weeks. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select(sf.try_make_interval(df.year, 'month', df.week)).show(truncate=False) |
| +------------------------------------------------+ |
| |try_make_interval(year, month, week, 0, 0, 0, 0)| |
| +------------------------------------------------+ |
| |100 years 11 months 7 days | |
| +------------------------------------------------+ |
| |
| Example 6: Try make interval from years and months. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select(sf.try_make_interval(df.year, 'month')).show(truncate=False) |
| +---------------------------------------------+ |
| |try_make_interval(year, month, 0, 0, 0, 0, 0)| |
| +---------------------------------------------+ |
| |100 years 11 months | |
| +---------------------------------------------+ |
| |
| Example 7: Try make interval from years. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select(sf.try_make_interval(df.year)).show(truncate=False) |
| +-----------------------------------------+ |
| |try_make_interval(year, 0, 0, 0, 0, 0, 0)| |
| +-----------------------------------------+ |
| |100 years | |
| +-----------------------------------------+ |
| |
| Example 8: Try make empty interval. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.try_make_interval()).show(truncate=False) |
| +--------------------------------------+ |
| |try_make_interval(0, 0, 0, 0, 0, 0, 0)| |
| +--------------------------------------+ |
| |0 seconds | |
| +--------------------------------------+ |
| |
| Example 9: Try make interval from years with overflow. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.try_make_interval(sf.lit(2147483647))).show(truncate=False) |
| +-----------------------------------------------+ |
| |try_make_interval(2147483647, 0, 0, 0, 0, 0, 0)| |
| +-----------------------------------------------+ |
| |NULL | |
| +-----------------------------------------------+ |
| """ |
| _years = lit(0) if years is None else years |
| _months = lit(0) if months is None else months |
| _weeks = lit(0) if weeks is None else weeks |
| _days = lit(0) if days is None else days |
| _hours = lit(0) if hours is None else hours |
| _mins = lit(0) if mins is None else mins |
| _secs = lit(decimal.Decimal(0)) if secs is None else secs |
| return _invoke_function_over_columns( |
| "try_make_interval", _years, _months, _weeks, _days, _hours, _mins, _secs |
| ) |
| |
| |
| @_try_remote_functions |
| def make_interval( |
| years: Optional["ColumnOrName"] = None, |
| months: Optional["ColumnOrName"] = None, |
| weeks: Optional["ColumnOrName"] = None, |
| days: Optional["ColumnOrName"] = None, |
| hours: Optional["ColumnOrName"] = None, |
| mins: Optional["ColumnOrName"] = None, |
| secs: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Make interval from years, months, weeks, days, hours, mins and secs. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or column name, optional |
| The number of years, positive or negative. |
| months : :class:`~pyspark.sql.Column` or column name, optional |
| The number of months, positive or negative. |
| weeks : :class:`~pyspark.sql.Column` or column name, optional |
| The number of weeks, positive or negative. |
| days : :class:`~pyspark.sql.Column` or column name, optional |
| The number of days, positive or negative. |
| hours : :class:`~pyspark.sql.Column` or column name, optional |
| The number of hours, positive or negative. |
| mins : :class:`~pyspark.sql.Column` or column name, optional |
| The number of minutes, positive or negative. |
| secs : :class:`~pyspark.sql.Column` or column name, optional |
| The number of seconds with the fractional part in microsecond precision. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains an interval. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_dt_interval` |
| :meth:`pyspark.sql.functions.make_ym_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| Example 1: Make interval from years, months, weeks, days, hours, mins and secs. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.make_interval(df.year, df.month, 'week', df.day, df.hour, df.min, df.sec) |
| ... ).show(truncate=False) |
| +---------------------------------------------------------------+ |
| |make_interval(year, month, week, day, hour, min, sec) | |
| +---------------------------------------------------------------+ |
| |100 years 11 months 8 days 12 hours 30 minutes 1.001001 seconds| |
| +---------------------------------------------------------------+ |
| |
| Example 2: Make interval from years, months, weeks, days, hours and mins. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.make_interval(df.year, df.month, 'week', df.day, df.hour, df.min) |
| ... ).show(truncate=False) |
| +---------------------------------------------------+ |
| |make_interval(year, month, week, day, hour, min, 0)| |
| +---------------------------------------------------+ |
| |100 years 11 months 8 days 12 hours 30 minutes | |
| +---------------------------------------------------+ |
| |
| Example 3: Make interval from years, months, weeks, days and hours. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.make_interval(df.year, df.month, 'week', df.day, df.hour) |
| ... ).show(truncate=False) |
| +-------------------------------------------------+ |
| |make_interval(year, month, week, day, hour, 0, 0)| |
| +-------------------------------------------------+ |
| |100 years 11 months 8 days 12 hours | |
| +-------------------------------------------------+ |
| |
| Example 4: Make interval from years, months, weeks and days. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select(sf.make_interval(df.year, df.month, 'week', df.day)).show(truncate=False) |
| +----------------------------------------------+ |
| |make_interval(year, month, week, day, 0, 0, 0)| |
| +----------------------------------------------+ |
| |100 years 11 months 8 days | |
| +----------------------------------------------+ |
| |
| Example 5: Make interval from years, months and weeks. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select(sf.make_interval(df.year, df.month, 'week')).show(truncate=False) |
| +--------------------------------------------+ |
| |make_interval(year, month, week, 0, 0, 0, 0)| |
| +--------------------------------------------+ |
| |100 years 11 months 7 days | |
| +--------------------------------------------+ |
| |
| Example 6: Make interval from years and months. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select(sf.make_interval(df.year, df.month)).show(truncate=False) |
| +-----------------------------------------+ |
| |make_interval(year, month, 0, 0, 0, 0, 0)| |
| +-----------------------------------------+ |
| |100 years 11 months | |
| +-----------------------------------------+ |
| |
| Example 7: Make interval from years. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]], |
| ... ['year', 'month', 'week', 'day', 'hour', 'min', 'sec']) |
| >>> df.select(sf.make_interval(df.year)).show(truncate=False) |
| +-------------------------------------+ |
| |make_interval(year, 0, 0, 0, 0, 0, 0)| |
| +-------------------------------------+ |
| |100 years | |
| +-------------------------------------+ |
| |
| Example 8: Make empty interval. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.make_interval()).show(truncate=False) |
| +----------------------------------+ |
| |make_interval(0, 0, 0, 0, 0, 0, 0)| |
| +----------------------------------+ |
| |0 seconds | |
| +----------------------------------+ |
| """ |
| _years = lit(0) if years is None else years |
| _months = lit(0) if months is None else months |
| _weeks = lit(0) if weeks is None else weeks |
| _days = lit(0) if days is None else days |
| _hours = lit(0) if hours is None else hours |
| _mins = lit(0) if mins is None else mins |
| _secs = lit(decimal.Decimal(0)) if secs is None else secs |
| return _invoke_function_over_columns( |
| "make_interval", _years, _months, _weeks, _days, _hours, _mins, _secs |
| ) |
| |
| |
| @_try_remote_functions |
| def make_time(hour: "ColumnOrName", minute: "ColumnOrName", second: "ColumnOrName") -> Column: |
| """ |
    Create a time from hour, minute and second fields. For invalid inputs it throws an error.
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| hour : :class:`~pyspark.sql.Column` or column name |
| The hour to represent, from 0 to 23. |
| minute : :class:`~pyspark.sql.Column` or column name |
| The minute to represent, from 0 to 59. |
| second : :class:`~pyspark.sql.Column` or column name |
| The second to represent, from 0 to 59.999999. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A column representing the created time. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(6, 30, 45.887)], ["hour", "minute", "second"]) |
| >>> df.select(sf.make_time("hour", "minute", "second").alias("time")).show() |
| +------------+ |
| | time| |
| +------------+ |
| |06:30:45.887| |
| +------------+ |
| """ |
| return _invoke_function_over_columns("make_time", hour, minute, second) |
| |
| |
| @_try_remote_functions |
| def make_timestamp( |
| years: "ColumnOrName", |
| months: "ColumnOrName", |
| days: "ColumnOrName", |
| hours: "ColumnOrName", |
| mins: "ColumnOrName", |
| secs: "ColumnOrName", |
| timezone: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Create timestamp from years, months, days, hours, mins, secs and timezone fields. |
| The result data type is consistent with the value of configuration `spark.sql.timestampType`. |
| If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL |
| on invalid inputs. Otherwise, it will throw an error instead. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or column name |
| The year to represent, from 1 to 9999 |
| months : :class:`~pyspark.sql.Column` or column name |
| The month-of-year to represent, from 1 (January) to 12 (December) |
| days : :class:`~pyspark.sql.Column` or column name |
| The day-of-month to represent, from 1 to 31 |
| hours : :class:`~pyspark.sql.Column` or column name |
| The hour-of-day to represent, from 0 to 23 |
| mins : :class:`~pyspark.sql.Column` or column name |
| The minute-of-hour to represent, from 0 to 59 |
| secs : :class:`~pyspark.sql.Column` or column name |
| The second-of-minute and its micro-fraction to represent, from 0 to 60. |
        The value can be either an integer like 13, or a fraction like 13.123.
        If the sec argument equals 60, the seconds field is set
        to 0 and 1 minute is added to the final timestamp.
| timezone : :class:`~pyspark.sql.Column` or column name, optional |
        The time zone identifier. For example, CET or UTC.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a timestamp. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.try_make_timestamp` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.make_time` |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| Example 1: Make timestamp from years, months, days, hours, mins and secs. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec, 'tz') |
| ... ).show(truncate=False) |
| +----------------------------------------------------+ |
| |make_timestamp(year, month, day, hour, min, sec, tz)| |
| +----------------------------------------------------+ |
| |2014-12-27 21:30:45.887 | |
| +----------------------------------------------------+ |
| |
| Example 2: Make timestamp without timezone. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec) |
| ... ).show(truncate=False) |
| +------------------------------------------------+ |
| |make_timestamp(year, month, day, hour, min, sec)| |
| +------------------------------------------------+ |
| |2014-12-28 06:30:45.887 | |
| +------------------------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| if timezone is not None: |
| return _invoke_function_over_columns( |
| "make_timestamp", years, months, days, hours, mins, secs, timezone |
| ) |
| else: |
| return _invoke_function_over_columns( |
| "make_timestamp", years, months, days, hours, mins, secs |
| ) |
| |
| |
| @_try_remote_functions |
| def try_make_timestamp( |
| years: "ColumnOrName", |
| months: "ColumnOrName", |
| days: "ColumnOrName", |
| hours: "ColumnOrName", |
| mins: "ColumnOrName", |
| secs: "ColumnOrName", |
| timezone: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Try to create timestamp from years, months, days, hours, mins, secs and timezone fields. |
| The result data type is consistent with the value of configuration `spark.sql.timestampType`. |
| The function returns NULL on invalid inputs. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or column name |
| The year to represent, from 1 to 9999 |
| months : :class:`~pyspark.sql.Column` or column name |
| The month-of-year to represent, from 1 (January) to 12 (December) |
| days : :class:`~pyspark.sql.Column` or column name |
| The day-of-month to represent, from 1 to 31 |
| hours : :class:`~pyspark.sql.Column` or column name |
| The hour-of-day to represent, from 0 to 23 |
| mins : :class:`~pyspark.sql.Column` or column name |
| The minute-of-hour to represent, from 0 to 59 |
| secs : :class:`~pyspark.sql.Column` or column name |
| The second-of-minute and its micro-fraction to represent, from 0 to 60. |
        The value can be either an integer like 13, or a fraction like 13.123.
        If the sec argument equals 60, the seconds field is set
        to 0 and 1 minute is added to the final timestamp.
| timezone : :class:`~pyspark.sql.Column` or column name, optional |
        The time zone identifier. For example, CET or UTC.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a timestamp or NULL in case of an error. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_timestamp` |
| :meth:`pyspark.sql.functions.make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.make_time` |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| Example 1: Make timestamp from years, months, days, hours, mins and secs. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.try_make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec, 'tz') |
| ... ).show(truncate=False) |
| +----------------------------------------------------+ |
| |try_make_timestamp(year, month, day, hour, min, sec)| |
| +----------------------------------------------------+ |
| |2014-12-27 21:30:45.887 | |
| +----------------------------------------------------+ |
| |
| Example 2: Make timestamp without timezone. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.try_make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec) |
| ... ).show(truncate=False) |
| +----------------------------------------------------+ |
| |try_make_timestamp(year, month, day, hour, min, sec)| |
| +----------------------------------------------------+ |
| |2014-12-28 06:30:45.887 | |
| +----------------------------------------------------+ |
| |
| Example 3: Make timestamp with invalid input. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 13, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.try_make_timestamp(df.year, df.month, df.day, 'hour', df.min, df.sec) |
| ... ).show(truncate=False) |
| +----------------------------------------------------+ |
| |try_make_timestamp(year, month, day, hour, min, sec)| |
| +----------------------------------------------------+ |
| |NULL | |
| +----------------------------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| if timezone is not None: |
| return _invoke_function_over_columns( |
| "try_make_timestamp", years, months, days, hours, mins, secs, timezone |
| ) |
| else: |
| return _invoke_function_over_columns( |
| "try_make_timestamp", years, months, days, hours, mins, secs |
| ) |
| |
| |
| @_try_remote_functions |
| def make_timestamp_ltz( |
| years: "ColumnOrName", |
| months: "ColumnOrName", |
| days: "ColumnOrName", |
| hours: "ColumnOrName", |
| mins: "ColumnOrName", |
| secs: "ColumnOrName", |
| timezone: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
    Create a timestamp with local time zone from years, months, days, hours, mins,
    secs and timezone fields. If the configuration `spark.sql.ansi.enabled` is false,
    the function returns NULL on invalid inputs. Otherwise, it will throw an error instead.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or str |
| The year to represent, from 1 to 9999 |
| months : :class:`~pyspark.sql.Column` or str |
| The month-of-year to represent, from 1 (January) to 12 (December) |
| days : :class:`~pyspark.sql.Column` or str |
| The day-of-month to represent, from 1 to 31 |
| hours : :class:`~pyspark.sql.Column` or str |
| The hour-of-day to represent, from 0 to 23 |
| mins : :class:`~pyspark.sql.Column` or str |
| The minute-of-hour to represent, from 0 to 59 |
| secs : :class:`~pyspark.sql.Column` or str |
| The second-of-minute and its micro-fraction to represent, from 0 to 60. |
        The value can be either an integer like 13, or a fraction like 13.123.
        If the sec argument equals 60, the seconds field is set
        to 0 and 1 minute is added to the final timestamp.
| timezone : :class:`~pyspark.sql.Column` or str, optional |
        The time zone identifier. For example, CET or UTC.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
A new column that contains a timestamp with local time zone.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_timestamp` |
| :meth:`pyspark.sql.functions.make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.try_make_timestamp` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.make_time` |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
Example 1: Make a timestamp from years, months, days, hours, mins and secs.
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.make_timestamp_ltz(df.year, df.month, 'day', df.hour, df.min, df.sec, 'tz') |
| ... ).show(truncate=False) |
| +--------------------------------------------------------+ |
| |make_timestamp_ltz(year, month, day, hour, min, sec, tz)| |
| +--------------------------------------------------------+ |
| |2014-12-27 21:30:45.887 | |
| +--------------------------------------------------------+ |
| |
Example 2: Make a timestamp without timezone.
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.make_timestamp_ltz(df.year, df.month, 'day', df.hour, df.min, df.sec) |
| ... ).show(truncate=False) |
| +----------------------------------------------------+ |
| |make_timestamp_ltz(year, month, day, hour, min, sec)| |
| +----------------------------------------------------+ |
| |2014-12-28 06:30:45.887 | |
| +----------------------------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| if timezone is not None: |
| return _invoke_function_over_columns( |
| "make_timestamp_ltz", years, months, days, hours, mins, secs, timezone |
| ) |
| else: |
| return _invoke_function_over_columns( |
| "make_timestamp_ltz", years, months, days, hours, mins, secs |
| ) |
| |
| |
| @_try_remote_functions |
| def try_make_timestamp_ltz( |
| years: "ColumnOrName", |
| months: "ColumnOrName", |
| days: "ColumnOrName", |
| hours: "ColumnOrName", |
| mins: "ColumnOrName", |
| secs: "ColumnOrName", |
| timezone: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
Try to create a timestamp with local time zone from years, months, days, hours, mins,
secs and timezone fields.
The function returns NULL on invalid inputs.
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or column name |
| The year to represent, from 1 to 9999 |
| months : :class:`~pyspark.sql.Column` or column name |
| The month-of-year to represent, from 1 (January) to 12 (December) |
| days : :class:`~pyspark.sql.Column` or column name |
| The day-of-month to represent, from 1 to 31 |
| hours : :class:`~pyspark.sql.Column` or column name |
| The hour-of-day to represent, from 0 to 23 |
| mins : :class:`~pyspark.sql.Column` or column name |
| The minute-of-hour to represent, from 0 to 59 |
| secs : :class:`~pyspark.sql.Column` or column name |
The second-of-minute and its micro-fraction to represent, from 0 to 60.
The value can be either an integer like 13, or a fraction like 13.123.
If the sec argument equals 60, the seconds field is set
to 0 and 1 minute is added to the final timestamp.
timezone : :class:`~pyspark.sql.Column` or column name, optional
The time zone identifier. For example, CET or UTC.
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
A new column that contains a timestamp with local time zone, or NULL in case of an error.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_timestamp` |
| :meth:`pyspark.sql.functions.make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.try_make_timestamp` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.make_time` |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
Example 1: Make a timestamp from years, months, days, hours, mins and secs.
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.try_make_timestamp_ltz('year', 'month', df.day, df.hour, df.min, df.sec, 'tz') |
| ... ).show(truncate=False) |
| +------------------------------------------------------------+ |
| |try_make_timestamp_ltz(year, month, day, hour, min, sec, tz)| |
| +------------------------------------------------------------+ |
| |2014-12-27 21:30:45.887 | |
| +------------------------------------------------------------+ |
| |
Example 2: Make a timestamp without timezone.
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.try_make_timestamp_ltz('year', 'month', df.day, df.hour, df.min, df.sec) |
| ... ).show(truncate=False) |
| +--------------------------------------------------------+ |
| |try_make_timestamp_ltz(year, month, day, hour, min, sec)| |
| +--------------------------------------------------------+ |
| |2014-12-28 06:30:45.887 | |
| +--------------------------------------------------------+ |
| |
Example 3: Make a timestamp with invalid input.
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 13, 28, 6, 30, 45.887, 'CET']], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec', 'tz']) |
| >>> df.select( |
| ... sf.try_make_timestamp_ltz('year', 'month', df.day, df.hour, df.min, df.sec) |
| ... ).show(truncate=False) |
| +--------------------------------------------------------+ |
| |try_make_timestamp_ltz(year, month, day, hour, min, sec)| |
| +--------------------------------------------------------+ |
| |NULL | |
| +--------------------------------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| if timezone is not None: |
| return _invoke_function_over_columns( |
| "try_make_timestamp_ltz", years, months, days, hours, mins, secs, timezone |
| ) |
| else: |
| return _invoke_function_over_columns( |
| "try_make_timestamp_ltz", years, months, days, hours, mins, secs |
| ) |
| |
| |
| @overload |
| def make_timestamp_ntz( |
| years: "ColumnOrName", |
| months: "ColumnOrName", |
| days: "ColumnOrName", |
| hours: "ColumnOrName", |
| mins: "ColumnOrName", |
| secs: "ColumnOrName", |
| ) -> Column: |
| ... |
| |
| |
| @overload |
| def make_timestamp_ntz( |
| *, |
| date: "ColumnOrName", |
| time: "ColumnOrName", |
| ) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def make_timestamp_ntz( |
| years: Optional["ColumnOrName"] = None, |
| months: Optional["ColumnOrName"] = None, |
| days: Optional["ColumnOrName"] = None, |
| hours: Optional["ColumnOrName"] = None, |
| mins: Optional["ColumnOrName"] = None, |
| secs: Optional["ColumnOrName"] = None, |
| date: Optional["ColumnOrName"] = None, |
| time: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
Create a local date-time from years, months, days, hours, mins and secs fields.
Alternatively, create a local date-time from date and time fields. If the configuration
`spark.sql.ansi.enabled` is false, the function returns NULL on invalid inputs.
Otherwise, it throws an error.
| |
| .. versionadded:: 3.5.0 |
| |
| .. versionchanged:: 4.1.0 |
| Added support for creating timestamps from date and time. |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or column name, optional |
| The year to represent, from 1 to 9999. |
| Required when creating timestamps from individual components. |
| Must be used with months, days, hours, mins, and secs. |
| months : :class:`~pyspark.sql.Column` or column name, optional |
| The month-of-year to represent, from 1 (January) to 12 (December). |
| Required when creating timestamps from individual components. |
| Must be used with years, days, hours, mins, and secs. |
| days : :class:`~pyspark.sql.Column` or column name, optional |
| The day-of-month to represent, from 1 to 31. |
| Required when creating timestamps from individual components. |
| Must be used with years, months, hours, mins, and secs. |
| hours : :class:`~pyspark.sql.Column` or column name, optional |
| The hour-of-day to represent, from 0 to 23. |
| Required when creating timestamps from individual components. |
| Must be used with years, months, days, mins, and secs. |
| mins : :class:`~pyspark.sql.Column` or column name, optional |
| The minute-of-hour to represent, from 0 to 59. |
| Required when creating timestamps from individual components. |
| Must be used with years, months, days, hours, and secs. |
| secs : :class:`~pyspark.sql.Column` or column name, optional |
| The second-of-minute and its micro-fraction to represent, from 0 to 60. |
| The value can be either an integer like 13, or a fraction like 13.123. |
If the sec argument equals 60, the seconds field is set
| to 0 and 1 minute is added to the final timestamp. |
| Required when creating timestamps from individual components. |
| Must be used with years, months, days, hours, and mins. |
| date : :class:`~pyspark.sql.Column` or column name, optional |
| The date to represent, in valid DATE format. |
| Required when creating timestamps from date and time components. |
| Must be used with time parameter only. |
| time : :class:`~pyspark.sql.Column` or column name, optional |
| The time to represent, in valid TIME format. |
| Required when creating timestamps from date and time components. |
| Must be used with date parameter only. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a local date-time. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_timestamp` |
| :meth:`pyspark.sql.functions.make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.try_make_timestamp` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.make_time` |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| Example 1: Make local date-time from years, months, days, hours, mins, secs. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.make_timestamp_ntz('year', 'month', df.day, df.hour, df.min, df.sec) |
| ... ).show(truncate=False) |
| +----------------------------------------------------+ |
| |make_timestamp_ntz(year, month, day, hour, min, sec)| |
| +----------------------------------------------------+ |
| |2014-12-28 06:30:45.887 | |
| +----------------------------------------------------+ |
| |
| Example 2: Make local date-time from date and time. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> from datetime import date, time |
| >>> df = spark.range(1).select( |
| ... sf.lit(date(2014, 12, 28)).alias("date"), |
| ... sf.lit(time(6, 30, 45, 887000)).alias("time") |
| ... ) |
| >>> df.select(sf.make_timestamp_ntz(date=df.date, time=df.time)).show(truncate=False) |
| +------------------------------+ |
| |make_timestamp_ntz(date, time)| |
| +------------------------------+ |
| |2014-12-28 06:30:45.887 | |
| +------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| if years is not None: |
| if any(arg is not None for arg in [date, time]): |
| raise PySparkValueError( |
| errorClass="CANNOT_SET_TOGETHER", |
| messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"}, |
| ) |
| return _invoke_function_over_columns( |
| "make_timestamp_ntz", |
| cast("ColumnOrName", years), |
| cast("ColumnOrName", months), |
| cast("ColumnOrName", days), |
| cast("ColumnOrName", hours), |
| cast("ColumnOrName", mins), |
| cast("ColumnOrName", secs), |
| ) |
| else: |
| if any(arg is not None for arg in [years, months, days, hours, mins, secs]): |
| raise PySparkValueError( |
| errorClass="CANNOT_SET_TOGETHER", |
| messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"}, |
| ) |
| return _invoke_function_over_columns( |
| "make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time) |
| ) |
| |
| |
| @overload |
| def try_make_timestamp_ntz( |
| years: "ColumnOrName", |
| months: "ColumnOrName", |
| days: "ColumnOrName", |
| hours: "ColumnOrName", |
| mins: "ColumnOrName", |
| secs: "ColumnOrName", |
| ) -> Column: |
| ... |
| |
| |
| @overload |
| def try_make_timestamp_ntz( |
| *, |
| date: "ColumnOrName", |
| time: "ColumnOrName", |
| ) -> Column: |
| ... |
| |
| |
| @_try_remote_functions |
| def try_make_timestamp_ntz( |
| years: Optional["ColumnOrName"] = None, |
| months: Optional["ColumnOrName"] = None, |
| days: Optional["ColumnOrName"] = None, |
| hours: Optional["ColumnOrName"] = None, |
| mins: Optional["ColumnOrName"] = None, |
| secs: Optional["ColumnOrName"] = None, |
| date: Optional["ColumnOrName"] = None, |
| time: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
Try to create a local date-time from years, months, days, hours, mins and secs fields.
Alternatively, try to create a local date-time from date and time fields.
The function returns NULL on invalid inputs.
| |
| .. versionadded:: 4.0.0 |
| |
| .. versionchanged:: 4.1.0 |
| Added support for creating timestamps from date and time. |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or column name, optional |
| The year to represent, from 1 to 9999. |
| Required when creating timestamps from individual components. |
| Must be used with months, days, hours, mins, and secs. |
| months : :class:`~pyspark.sql.Column` or column name, optional |
| The month-of-year to represent, from 1 (January) to 12 (December). |
| Required when creating timestamps from individual components. |
| Must be used with years, days, hours, mins, and secs. |
| days : :class:`~pyspark.sql.Column` or column name, optional |
| The day-of-month to represent, from 1 to 31. |
| Required when creating timestamps from individual components. |
| Must be used with years, months, hours, mins, and secs. |
| hours : :class:`~pyspark.sql.Column` or column name, optional |
| The hour-of-day to represent, from 0 to 23. |
| Required when creating timestamps from individual components. |
| Must be used with years, months, days, mins, and secs. |
| mins : :class:`~pyspark.sql.Column` or column name, optional |
| The minute-of-hour to represent, from 0 to 59. |
| Required when creating timestamps from individual components. |
| Must be used with years, months, days, hours, and secs. |
| secs : :class:`~pyspark.sql.Column` or column name, optional |
| The second-of-minute and its micro-fraction to represent, from 0 to 60. |
| The value can be either an integer like 13, or a fraction like 13.123. |
If the sec argument equals 60, the seconds field is set
| to 0 and 1 minute is added to the final timestamp. |
| Required when creating timestamps from individual components. |
| Must be used with years, months, days, hours, and mins. |
| date : :class:`~pyspark.sql.Column` or column name, optional |
| The date to represent, in valid DATE format. |
| Required when creating timestamps from date and time components. |
| Must be used with time parameter only. |
| time : :class:`~pyspark.sql.Column` or column name, optional |
| The time to represent, in valid TIME format. |
| Required when creating timestamps from date and time components. |
| Must be used with date parameter only. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a local date-time, or NULL in case of an error. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_timestamp` |
| :meth:`pyspark.sql.functions.make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.make_timestamp_ntz` |
| :meth:`pyspark.sql.functions.try_make_timestamp` |
| :meth:`pyspark.sql.functions.try_make_timestamp_ltz` |
| :meth:`pyspark.sql.functions.make_time` |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
| Example 1: Make local date-time from years, months, days, hours, mins, secs. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.try_make_timestamp_ntz('year', 'month', df.day, df.hour, df.min, df.sec) |
| ... ).show(truncate=False) |
| +--------------------------------------------------------+ |
| |try_make_timestamp_ntz(year, month, day, hour, min, sec)| |
| +--------------------------------------------------------+ |
| |2014-12-28 06:30:45.887 | |
| +--------------------------------------------------------+ |
| |
Example 2: Make local date-time with invalid input.
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 13, 28, 6, 30, 45.887]], |
| ... ['year', 'month', 'day', 'hour', 'min', 'sec']) |
| >>> df.select( |
| ... sf.try_make_timestamp_ntz('year', 'month', df.day, df.hour, df.min, df.sec) |
| ... ).show(truncate=False) |
| +--------------------------------------------------------+ |
| |try_make_timestamp_ntz(year, month, day, hour, min, sec)| |
| +--------------------------------------------------------+ |
| |NULL | |
| +--------------------------------------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| if years is not None: |
| if any(arg is not None for arg in [date, time]): |
| raise PySparkValueError( |
| errorClass="CANNOT_SET_TOGETHER", |
| messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"}, |
| ) |
| return _invoke_function_over_columns( |
| "try_make_timestamp_ntz", |
| cast("ColumnOrName", years), |
| cast("ColumnOrName", months), |
| cast("ColumnOrName", days), |
| cast("ColumnOrName", hours), |
| cast("ColumnOrName", mins), |
| cast("ColumnOrName", secs), |
| ) |
| else: |
| if any(arg is not None for arg in [years, months, days, hours, mins, secs]): |
| raise PySparkValueError( |
| errorClass="CANNOT_SET_TOGETHER", |
| messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"}, |
| ) |
| return _invoke_function_over_columns( |
| "try_make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time) |
| ) |
| |
| |
| @_try_remote_functions |
| def make_ym_interval( |
| years: Optional["ColumnOrName"] = None, |
| months: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
Make year-month interval from years and months.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| years : :class:`~pyspark.sql.Column` or column name, optional |
| The number of years, positive or negative |
| months : :class:`~pyspark.sql.Column` or column name, optional |
| The number of months, positive or negative |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a year-month interval. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.make_interval` |
| :meth:`pyspark.sql.functions.make_dt_interval` |
| :meth:`pyspark.sql.functions.try_make_interval` |
| |
| Examples |
| -------- |
| >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") |
| |
Example 1: Make year-month interval from years and months.
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12]], ['year', 'month']) |
| >>> df.select('*', sf.make_ym_interval('year', df.month)).show(truncate=False) |
| +----+-----+-------------------------------+ |
| |year|month|make_ym_interval(year, month) | |
| +----+-----+-------------------------------+ |
| |2014|12 |INTERVAL '2015-0' YEAR TO MONTH| |
| +----+-----+-------------------------------+ |
| |
| Example 2: Make year-month interval from years. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([[2014, 12]], ['year', 'month']) |
| >>> df.select('*', sf.make_ym_interval(df.year)).show(truncate=False) |
| +----+-----+-------------------------------+ |
| |year|month|make_ym_interval(year, 0) | |
| +----+-----+-------------------------------+ |
| |2014|12 |INTERVAL '2014-0' YEAR TO MONTH| |
| +----+-----+-------------------------------+ |
| |
Example 3: Make an empty interval.
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.make_ym_interval()).show(truncate=False) |
| +----------------------------+ |
| |make_ym_interval(0, 0) | |
| +----------------------------+ |
| |INTERVAL '0-0' YEAR TO MONTH| |
| +----------------------------+ |
| |
| >>> spark.conf.unset("spark.sql.session.timeZone") |
| """ |
| _years = lit(0) if years is None else years |
| _months = lit(0) if months is None else months |
| return _invoke_function_over_columns("make_ym_interval", _years, _months) |
| |
| |
| @_try_remote_functions |
| def bucket(numBuckets: Union[Column, int], col: "ColumnOrName") -> Column: |
| """ |
| Partition transform function: A transform for any type that partitions |
| by a hash of the input column. |
| |
| .. versionadded:: 3.1.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. deprecated:: 4.0.0 |
| Use :func:`partitioning.bucket` instead. |
| |
Parameters
----------
numBuckets : :class:`~pyspark.sql.Column` or int
the number of buckets
col : :class:`~pyspark.sql.Column` or str
target column to work on.

Returns
-------
:class:`~pyspark.sql.Column`
data partitioned by given columns.

Notes
-----
This function can be used only in combination with the
:py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`
method of the `DataFrameWriterV2`.

Examples
--------
>>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP
... bucket(42, "ts")
... ).createOrReplace()

| """ |
| from pyspark.sql.functions import partitioning |
| |
| warnings.warn("Deprecated in 4.0.0, use partitioning.bucket instead.", FutureWarning) |
| |
| return partitioning.bucket(numBuckets, col) |
| |
| |
| @_try_remote_functions |
| def call_udf(udfName: str, *cols: "ColumnOrName") -> Column: |
| """ |
| Call a user-defined function. |
| |
| .. versionadded:: 3.4.0 |
| |
| Parameters |
| ---------- |
| udfName : str |
| name of the user defined function (UDF) |
| cols : :class:`~pyspark.sql.Column` or str |
| column names or :class:`~pyspark.sql.Column`\\s to be used in the UDF |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| result of executed udf. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql.functions import call_udf, col |
| >>> from pyspark.sql.types import IntegerType, StringType |
| >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"]) |
| >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType()) |
| >>> df.select(call_udf("intX2", "id")).show() |
| +---------+ |
| |intX2(id)| |
| +---------+ |
| | 2| |
| | 4| |
| | 6| |
| +---------+ |
| >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType()) |
| >>> df.select(call_udf("strX2", col("name"))).show() |
| +-----------+ |
| |strX2(name)| |
| +-----------+ |
| | aa| |
| | bb| |
| | cc| |
| +-----------+ |
| """ |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| sc = _get_active_spark_context() |
| return _invoke_function("call_udf", udfName, _to_seq(sc, cols, _to_java_column)) |
| |
| |
| @_try_remote_functions |
| def call_function(funcName: str, *cols: "ColumnOrName") -> Column: |
| """ |
| Call a SQL function. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| funcName : str |
| function name that follows the SQL identifier syntax (can be quoted, can be qualified) |
| cols : :class:`~pyspark.sql.Column` or str |
| column names or :class:`~pyspark.sql.Column`\\s to be used in the function |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| result of executed function. |
| |
| Examples |
| -------- |
>>> from pyspark.sql.functions import call_function, col
| >>> from pyspark.sql.types import IntegerType, StringType |
| >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"]) |
| >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType()) |
| >>> df.select(call_function("intX2", "id")).show() |
| +---------+ |
| |intX2(id)| |
| +---------+ |
| | 2| |
| | 4| |
| | 6| |
| +---------+ |
| >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType()) |
| >>> df.select(call_function("strX2", col("name"))).show() |
| +-----------+ |
| |strX2(name)| |
| +-----------+ |
| | aa| |
| | bb| |
| | cc| |
| +-----------+ |
| >>> df.select(call_function("avg", col("id"))).show() |
| +-------+ |
| |avg(id)| |
| +-------+ |
| | 2.0| |
| +-------+ |
| >>> _ = spark.sql("CREATE FUNCTION custom_avg AS 'test.org.apache.spark.sql.MyDoubleAvg'") |
| ... # doctest: +SKIP |
| >>> df.select(call_function("custom_avg", col("id"))).show() |
| ... # doctest: +SKIP |
| +------------------------------------+ |
| |spark_catalog.default.custom_avg(id)| |
| +------------------------------------+ |
| | 102.0| |
| +------------------------------------+ |
| >>> df.select(call_function("spark_catalog.default.custom_avg", col("id"))).show() |
| ... # doctest: +SKIP |
| +------------------------------------+ |
| |spark_catalog.default.custom_avg(id)| |
| +------------------------------------+ |
| | 102.0| |
| +------------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_seq, _to_java_column |
| |
| sc = _get_active_spark_context() |
| return _invoke_function("call_function", funcName, _to_seq(sc, cols, _to_java_column)) |
| |
| |
| @_try_remote_functions |
| def unwrap_udt(col: "ColumnOrName") -> Column: |
| """ |
| Unwrap UDT data type column into its underlying type. |
| |
| .. versionadded:: 3.4.0 |
| |
| Notes |
| ----- |
| Supports Spark Connect. |
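
Examples
--------
A minimal sketch, assuming the optional ``pyspark.ml`` package is available;
``VectorUDT`` is used purely as an illustrative UDT (not run here).

>>> from pyspark.sql import functions as sf
>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]),)], ["v"])
>>> df.select(sf.unwrap_udt("v")).printSchema()  # doctest: +SKIP
root
 |-- unwrap_udt(v): struct (nullable = true)
...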
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("unwrap_udt", _to_java_column(col)) |
| |
| |
| @_try_remote_functions |
| def hll_sketch_agg( |
| col: "ColumnOrName", |
| lgConfigK: Optional[Union[int, Column]] = None, |
| ) -> Column: |
| """ |
Aggregate function: returns the updatable binary representation of the Datasketches
HllSketch configured with the `lgConfigK` argument.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| lgConfigK : :class:`~pyspark.sql.Column` or int, optional |
| The log-base-2 of K, where K is the number of buckets or slots for the HllSketch |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the HllSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.hll_union` |
| :meth:`pyspark.sql.functions.hll_union_agg` |
| :meth:`pyspark.sql.functions.hll_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([1,2,2,3], "INT") |
| >>> df.agg(sf.hll_sketch_estimate(sf.hll_sketch_agg("value"))).show() |
| +----------------------------------------------+ |
| |hll_sketch_estimate(hll_sketch_agg(value, 12))| |
| +----------------------------------------------+ |
| | 3| |
| +----------------------------------------------+ |
| |
| >>> df.agg(sf.hll_sketch_estimate(sf.hll_sketch_agg("value", 12))).show() |
| +----------------------------------------------+ |
| |hll_sketch_estimate(hll_sketch_agg(value, 12))| |
| +----------------------------------------------+ |
| | 3| |
| +----------------------------------------------+ |
| """ |
| if lgConfigK is None: |
| return _invoke_function_over_columns("hll_sketch_agg", col) |
| else: |
| return _invoke_function_over_columns("hll_sketch_agg", col, lit(lgConfigK)) |
| |
| |
| @_try_remote_functions |
| def hll_union_agg( |
| col: "ColumnOrName", |
| allowDifferentLgConfigK: Optional[Union[bool, Column]] = None, |
| ) -> Column: |
| """ |
| Aggregate function: returns the updatable binary representation of the Datasketches |
| HllSketch, generated by merging previously created Datasketches HllSketch instances |
| via a Datasketches Union instance. Throws an exception if sketches have different |
| lgConfigK values and allowDifferentLgConfigK is unset or set to false. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| allowDifferentLgConfigK : :class:`~pyspark.sql.Column` or bool, optional |
| Allow sketches with different lgConfigK values to be merged (defaults to false). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the merged HllSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.hll_union` |
| :meth:`pyspark.sql.functions.hll_sketch_agg` |
| :meth:`pyspark.sql.functions.hll_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df1 = spark.createDataFrame([1,2,2,3], "INT") |
| >>> df1 = df1.agg(sf.hll_sketch_agg("value").alias("sketch")) |
| >>> df2 = spark.createDataFrame([4,5,5,6], "INT") |
| >>> df2 = df2.agg(sf.hll_sketch_agg("value").alias("sketch")) |
| >>> df3 = df1.union(df2) |
| >>> df3.agg(sf.hll_sketch_estimate(sf.hll_union_agg("sketch"))).show() |
| +-------------------------------------------------+ |
| |hll_sketch_estimate(hll_union_agg(sketch, false))| |
| +-------------------------------------------------+ |
| | 6| |
| +-------------------------------------------------+ |
| |
| >>> df3.agg(sf.hll_sketch_estimate(sf.hll_union_agg("sketch", False))).show() |
| +-------------------------------------------------+ |
| |hll_sketch_estimate(hll_union_agg(sketch, false))| |
| +-------------------------------------------------+ |
| | 6| |
| +-------------------------------------------------+ |
| """ |
| if allowDifferentLgConfigK is None: |
| return _invoke_function_over_columns("hll_union_agg", col) |
| else: |
| return _invoke_function_over_columns("hll_union_agg", col, lit(allowDifferentLgConfigK)) |
| |
| |
| @_try_remote_functions |
| def hll_sketch_estimate(col: "ColumnOrName") -> Column: |
| """ |
| Returns the estimated number of unique values given the binary representation |
| of a Datasketches HllSketch. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The estimated number of unique values for the HllSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.hll_union` |
| :meth:`pyspark.sql.functions.hll_union_agg` |
| :meth:`pyspark.sql.functions.hll_sketch_agg` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([1,2,2,3], "INT") |
| >>> df.agg(sf.hll_sketch_estimate(sf.hll_sketch_agg("value"))).show() |
| +----------------------------------------------+ |
| |hll_sketch_estimate(hll_sketch_agg(value, 12))| |
| +----------------------------------------------+ |
| | 3| |
| +----------------------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| return _invoke_function("hll_sketch_estimate", _to_java_column(col)) |
| |
| |
| @_try_remote_functions |
| def hll_union( |
| col1: "ColumnOrName", col2: "ColumnOrName", allowDifferentLgConfigK: Optional[bool] = None |
| ) -> Column: |
| """ |
| Merges two binary representations of Datasketches HllSketch objects, using a |
| Datasketches Union object. Throws an exception if sketches have different |
| lgConfigK values and allowDifferentLgConfigK is unset or set to false. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| col2 : :class:`~pyspark.sql.Column` or column name |
| allowDifferentLgConfigK : bool, optional |
| Allow sketches with different lgConfigK values to be merged (defaults to false). |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the merged HllSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.hll_union_agg` |
| :meth:`pyspark.sql.functions.hll_sketch_agg` |
| :meth:`pyspark.sql.functions.hll_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], "struct<v1:int,v2:int>") |
| >>> df = df.agg( |
| ... sf.hll_sketch_agg("v1").alias("sketch1"), |
| ... sf.hll_sketch_agg("v2").alias("sketch2") |
| ... ) |
| >>> df.select(sf.hll_sketch_estimate(sf.hll_union(df.sketch1, "sketch2"))).show() |
| +-------------------------------------------------------+ |
| |hll_sketch_estimate(hll_union(sketch1, sketch2, false))| |
| +-------------------------------------------------------+ |
| | 6| |
| +-------------------------------------------------------+ |
| """ |
| from pyspark.sql.classic.column import _to_java_column |
| |
| if allowDifferentLgConfigK is not None: |
| return _invoke_function( |
| "hll_union", |
| _to_java_column(col1), |
| _to_java_column(col2), |
| _enum_to_value(allowDifferentLgConfigK), |
| ) |
| else: |
| return _invoke_function("hll_union", _to_java_column(col1), _to_java_column(col2)) |
| |
| |
| @_try_remote_functions |
| def theta_sketch_agg( |
| col: "ColumnOrName", |
| lgNomEntries: Optional[Union[int, Column]] = None, |
| ) -> Column: |
| """ |
Aggregate function: returns the compact binary representation of a Datasketches
ThetaSketch built from the values in the input column, configured with
`lgNomEntries` nominal entries.
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| lgNomEntries : :class:`~pyspark.sql.Column` or int, optional |
| The log-base-2 of nominal entries, where nominal entries is the size of the sketch |
| (must be between 4 and 26, defaults to 12) |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the ThetaSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.theta_union` |
| :meth:`pyspark.sql.functions.theta_intersection` |
| :meth:`pyspark.sql.functions.theta_difference` |
| :meth:`pyspark.sql.functions.theta_union_agg` |
| :meth:`pyspark.sql.functions.theta_intersection_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([1,2,2,3], "INT") |
| >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show() |
| +--------------------------------------------------+ |
| |theta_sketch_estimate(theta_sketch_agg(value, 12))| |
| +--------------------------------------------------+ |
| | 3| |
| +--------------------------------------------------+ |
| |
| >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value", 15))).show() |
| +--------------------------------------------------+ |
| |theta_sketch_estimate(theta_sketch_agg(value, 15))| |
| +--------------------------------------------------+ |
| | 3| |
| +--------------------------------------------------+ |
| """ |
| fn = "theta_sketch_agg" |
| if lgNomEntries is None: |
| return _invoke_function_over_columns(fn, col) |
| else: |
| return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) |
| |
| |
| @_try_remote_functions |
| def theta_union_agg( |
| col: "ColumnOrName", |
| lgNomEntries: Optional[Union[int, Column]] = None, |
| ) -> Column: |
| """ |
| Aggregate function: returns the compact binary representation of the Datasketches |
| ThetaSketch that is the union of the Theta sketches in the input column. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| lgNomEntries : :class:`~pyspark.sql.Column` or int, optional |
| The log-base-2 of nominal entries for the union operation |
| (must be between 4 and 26, defaults to 12) |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the merged ThetaSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.theta_union` |
| :meth:`pyspark.sql.functions.theta_sketch_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df1 = spark.createDataFrame([1,2,2,3], "INT") |
| >>> df1 = df1.agg(sf.theta_sketch_agg("value").alias("sketch")) |
| >>> df2 = spark.createDataFrame([4,5,5,6], "INT") |
| >>> df2 = df2.agg(sf.theta_sketch_agg("value").alias("sketch")) |
| >>> df3 = df1.union(df2) |
| >>> df3.agg(sf.theta_sketch_estimate(sf.theta_union_agg("sketch"))).show() |
| +--------------------------------------------------+ |
| |theta_sketch_estimate(theta_union_agg(sketch, 12))| |
| +--------------------------------------------------+ |
| | 6| |
| +--------------------------------------------------+ |
| """ |
| fn = "theta_union_agg" |
| if lgNomEntries is None: |
| return _invoke_function_over_columns(fn, col) |
| else: |
| return _invoke_function_over_columns(fn, col, lit(lgNomEntries)) |
| |
| |
| @_try_remote_functions |
| def theta_intersection_agg(col: "ColumnOrName") -> Column: |
| """ |
| Aggregate function: returns the compact binary representation of the Datasketches |
ThetaSketch that is the intersection of the Theta sketches in the input column.
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the intersected ThetaSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.theta_intersection` |
| :meth:`pyspark.sql.functions.theta_sketch_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df1 = spark.createDataFrame([1,2,2,3], "INT") |
| >>> df1 = df1.agg(sf.theta_sketch_agg("value").alias("sketch")) |
| >>> df2 = spark.createDataFrame([2,3,3,4], "INT") |
| >>> df2 = df2.agg(sf.theta_sketch_agg("value").alias("sketch")) |
| >>> df3 = df1.union(df2) |
| >>> df3.agg(sf.theta_sketch_estimate(sf.theta_intersection_agg("sketch"))).show() |
| +-----------------------------------------------------+ |
| |theta_sketch_estimate(theta_intersection_agg(sketch))| |
| +-----------------------------------------------------+ |
| | 2| |
| +-----------------------------------------------------+ |
| """ |
| fn = "theta_intersection_agg" |
| return _invoke_function_over_columns(fn, col) |
| |
| |
| @_try_remote_functions |
| def theta_sketch_estimate(col: "ColumnOrName") -> Column: |
| """ |
| Returns the estimated number of unique values given the binary representation |
| of a Datasketches ThetaSketch. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The estimated number of unique values for the ThetaSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.theta_union` |
| :meth:`pyspark.sql.functions.theta_intersection` |
| :meth:`pyspark.sql.functions.theta_difference` |
| :meth:`pyspark.sql.functions.theta_union_agg` |
| :meth:`pyspark.sql.functions.theta_intersection_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_agg` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([1,2,2,3], "INT") |
| >>> df.agg(sf.theta_sketch_estimate(sf.theta_sketch_agg("value"))).show() |
| +--------------------------------------------------+ |
| |theta_sketch_estimate(theta_sketch_agg(value, 12))| |
| +--------------------------------------------------+ |
| | 3| |
| +--------------------------------------------------+ |
| """ |
| |
| fn = "theta_sketch_estimate" |
| return _invoke_function_over_columns(fn, col) |
| |
| |
| @_try_remote_functions |
| def theta_union( |
| col1: "ColumnOrName", col2: "ColumnOrName", lgNomEntries: Optional[Union[int, Column]] = None |
| ) -> Column: |
| """ |
| Merges two binary representations of Datasketches ThetaSketch objects, using a |
| Datasketches Union object. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| col2 : :class:`~pyspark.sql.Column` or column name |
| lgNomEntries : :class:`~pyspark.sql.Column` or int, optional |
| The log-base-2 of nominal entries for the union operation |
| (must be between 4 and 26, defaults to 12) |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the merged ThetaSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.theta_union_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], "struct<v1:int,v2:int>") |
| >>> df = df.agg( |
| ... sf.theta_sketch_agg("v1").alias("sketch1"), |
| ... sf.theta_sketch_agg("v2").alias("sketch2") |
| ... ) |
| >>> df.select(sf.theta_sketch_estimate(sf.theta_union(df.sketch1, "sketch2"))).show() |
| +--------------------------------------------------------+ |
| |theta_sketch_estimate(theta_union(sketch1, sketch2, 12))| |
| +--------------------------------------------------------+ |
| | 6| |
| +--------------------------------------------------------+ |
| """ |
| |
| fn = "theta_union" |
| if lgNomEntries is not None: |
| return _invoke_function_over_columns( |
| fn, |
| col1, |
| col2, |
| lit(lgNomEntries), |
| ) |
| else: |
| return _invoke_function_over_columns(fn, col1, col2) |
| |
| |
| @_try_remote_functions |
| def theta_intersection(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
| Returns the intersection of two binary representations of Datasketches ThetaSketch |
| objects, using a Datasketches Intersection object. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| col2 : :class:`~pyspark.sql.Column` or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the intersected ThetaSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.theta_intersection_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,1),(2,2),(3,2),(3,3)], "struct<v1:int,v2:int>") |
| >>> df = df.agg( |
| ... sf.theta_sketch_agg("v1").alias("sketch1"), |
| ... sf.theta_sketch_agg("v2").alias("sketch2") |
| ... ) |
| >>> df.select(sf.theta_sketch_estimate(sf.theta_intersection(df.sketch1, "sketch2"))).show() |
| +-----------------------------------------------------------+ |
| |theta_sketch_estimate(theta_intersection(sketch1, sketch2))| |
| +-----------------------------------------------------------+ |
| | 3| |
| +-----------------------------------------------------------+ |
| """ |
| |
| fn = "theta_intersection" |
| return _invoke_function_over_columns(fn, col1, col2) |
| |
| |
| @_try_remote_functions |
| def theta_difference(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
| Returns the set difference of two binary representations of Datasketches ThetaSketch |
| objects (elements in first sketch but not in second), using a Datasketches ANotB object. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| col2 : :class:`~pyspark.sql.Column` or column name |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| The binary representation of the difference ThetaSketch. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.theta_union` |
| :meth:`pyspark.sql.functions.theta_intersection` |
| :meth:`pyspark.sql.functions.theta_sketch_agg` |
| :meth:`pyspark.sql.functions.theta_sketch_estimate` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,4),(2,4),(3,5),(4,5)], "struct<v1:int,v2:int>") |
| >>> df = df.agg( |
| ... sf.theta_sketch_agg("v1").alias("sketch1"), |
| ... sf.theta_sketch_agg("v2").alias("sketch2") |
| ... ) |
| >>> df.select(sf.theta_sketch_estimate(sf.theta_difference(df.sketch1, "sketch2"))).show() |
| +---------------------------------------------------------+ |
| |theta_sketch_estimate(theta_difference(sketch1, sketch2))| |
| +---------------------------------------------------------+ |
| | 3| |
| +---------------------------------------------------------+ |
| """ |
| |
| fn = "theta_difference" |
| return _invoke_function_over_columns(fn, col1, col2) |
| |
| |
| # ---------------------- Predicates functions ------------------------------ |
| |
| |
| @_try_remote_functions |
| def ifnull(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
| Returns `col2` if `col1` is null, or `col1` otherwise. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or str |
| col2 : :class:`~pyspark.sql.Column` or str |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(None,), (1,)], ["e"]) |
| >>> df.select(sf.ifnull(df.e, sf.lit(8))).show() |
| +------------+ |
| |ifnull(e, 8)| |
| +------------+ |
| | 8| |
| | 1| |
| +------------+ |
| """ |
| return _invoke_function_over_columns("ifnull", col1, col2) |
| |
| |
| @_try_remote_functions |
| def isnotnull(col: "ColumnOrName") -> Column: |
| """ |
| Returns true if `col` is not null, or false otherwise. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(None,), (1,)], ["e"]) |
| >>> df.select('*', sf.isnotnull(df.e)).show() |
| +----+---------------+ |
| | e|(e IS NOT NULL)| |
| +----+---------------+ |
| |NULL| false| |
| | 1| true| |
| +----+---------------+ |
| |
| >>> df.select('*', sf.isnotnull('e')).show() |
| +----+---------------+ |
| | e|(e IS NOT NULL)| |
| +----+---------------+ |
| |NULL| false| |
| | 1| true| |
| +----+---------------+ |
| """ |
| return _invoke_function_over_columns("isnotnull", col) |
| |
| |
| @_try_remote_functions |
| def equal_null(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
Returns the same result as the EQUAL(=) operator for non-null operands,
but returns true if both are null, and false if one of them is null.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| col2 : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(None, None,), (1, 9,)], ["a", "b"]) |
| >>> df.select('*', sf.equal_null(df.a, df.b)).show() |
| +----+----+----------------+ |
| | a| b|equal_null(a, b)| |
| +----+----+----------------+ |
| |NULL|NULL| true| |
| | 1| 9| false| |
| +----+----+----------------+ |
| |
| >>> df.select('*', sf.equal_null('a', 'b')).show() |
| +----+----+----------------+ |
| | a| b|equal_null(a, b)| |
| +----+----+----------------+ |
| |NULL|NULL| true| |
| | 1| 9| false| |
| +----+----+----------------+ |
| """ |
| return _invoke_function_over_columns("equal_null", col1, col2) |
| |
| |
| @_try_remote_functions |
| def nullif(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
Returns null if `col1` equals `col2`, or `col1` otherwise.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| col2 : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(None, None,), (1, 9,)], ["a", "b"]) |
| >>> df.select('*', sf.nullif(df.a, df.b)).show() |
| +----+----+------------+ |
| | a| b|nullif(a, b)| |
| +----+----+------------+ |
| |NULL|NULL| NULL| |
| | 1| 9| 1| |
| +----+----+------------+ |
| |
| >>> df.select('*', sf.nullif('a', 'b')).show() |
| +----+----+------------+ |
| | a| b|nullif(a, b)| |
| +----+----+------------+ |
| |NULL|NULL| NULL| |
| | 1| 9| 1| |
| +----+----+------------+ |
| """ |
| return _invoke_function_over_columns("nullif", col1, col2) |
| |
| |
| @_try_remote_functions |
| def nullifzero(col: "ColumnOrName") -> Column: |
| """ |
| Returns null if `col` is equal to zero, or `col` otherwise. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(0,), (1,)], ["a"]) |
| >>> df.select('*', sf.nullifzero(df.a)).show() |
| +---+-------------+ |
| | a|nullifzero(a)| |
| +---+-------------+ |
| | 0| NULL| |
| | 1| 1| |
| +---+-------------+ |
| |
| >>> df.select('*', sf.nullifzero('a')).show() |
| +---+-------------+ |
| | a|nullifzero(a)| |
| +---+-------------+ |
| | 0| NULL| |
| | 1| 1| |
| +---+-------------+ |
| """ |
| return _invoke_function_over_columns("nullifzero", col) |
| |
| |
| @_try_remote_functions |
| def nvl(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: |
| """ |
| Returns `col2` if `col1` is null, or `col1` otherwise. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| col2 : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(None, 8,), (1, 9,)], ["a", "b"]) |
| >>> df.select('*', sf.nvl(df.a, df.b)).show() |
| +----+---+---------+ |
| | a| b|nvl(a, b)| |
| +----+---+---------+ |
| |NULL| 8| 8| |
| | 1| 9| 1| |
| +----+---+---------+ |
| |
| >>> df.select('*', sf.nvl('a', 'b')).show() |
| +----+---+---------+ |
| | a| b|nvl(a, b)| |
| +----+---+---------+ |
| |NULL| 8| 8| |
| | 1| 9| 1| |
| +----+---+---------+ |
| """ |
| return _invoke_function_over_columns("nvl", col1, col2) |
| |
| |
| @_try_remote_functions |
| def nvl2(col1: "ColumnOrName", col2: "ColumnOrName", col3: "ColumnOrName") -> Column: |
| """ |
| Returns `col2` if `col1` is not null, or `col3` otherwise. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col1 : :class:`~pyspark.sql.Column` or column name |
| col2 : :class:`~pyspark.sql.Column` or column name |
| col3 : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(None, 8, 6,), (1, 9, 9,)], ["a", "b", "c"]) |
| >>> df.select('*', sf.nvl2(df.a, df.b, df.c)).show() |
| +----+---+---+-------------+ |
| | a| b| c|nvl2(a, b, c)| |
| +----+---+---+-------------+ |
| |NULL| 8| 6| 6| |
| | 1| 9| 9| 9| |
| +----+---+---+-------------+ |
| |
| >>> df.select('*', sf.nvl2('a', 'b', 'c')).show() |
| +----+---+---+-------------+ |
| | a| b| c|nvl2(a, b, c)| |
| +----+---+---+-------------+ |
| |NULL| 8| 6| 6| |
| | 1| 9| 9| 9| |
| +----+---+---+-------------+ |
| """ |
| return _invoke_function_over_columns("nvl2", col1, col2, col3) |
| |
| |
| @_try_remote_functions |
| def zeroifnull(col: "ColumnOrName") -> Column: |
| """ |
| Returns zero if `col` is null, or `col` otherwise. |
| |
| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([(None,), (1,)], ["a"]) |
| >>> df.select('*', sf.zeroifnull(df.a)).show() |
| +----+-------------+ |
| | a|zeroifnull(a)| |
| +----+-------------+ |
| |NULL| 0| |
| | 1| 1| |
| +----+-------------+ |
| |
| >>> df.select('*', sf.zeroifnull('a')).show() |
| +----+-------------+ |
| | a|zeroifnull(a)| |
| +----+-------------+ |
| |NULL| 0| |
| | 1| 1| |
| +----+-------------+ |
| """ |
| return _invoke_function_over_columns("zeroifnull", col) |
| |
| |
| @_try_remote_functions |
| def aes_encrypt( |
| input: "ColumnOrName", |
| key: "ColumnOrName", |
| mode: Optional["ColumnOrName"] = None, |
| padding: Optional["ColumnOrName"] = None, |
| iv: Optional["ColumnOrName"] = None, |
| aad: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
Returns an encrypted value of `input` using AES in the given `mode` with the specified `padding`.
Key lengths of 16, 24 and 32 bytes are supported. Supported combinations of (`mode`,
| `padding`) are ('ECB', 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional initialization |
| vectors (IVs) are only supported for CBC and GCM modes. These must be 16 bytes for CBC and 12 |
| bytes for GCM. If not provided, a random vector will be generated and prepended to the |
| output. Optional additional authenticated data (AAD) is only supported for GCM. If provided |
| for encryption, the identical AAD value must be provided for decryption. The default mode is |
| GCM. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| input : :class:`~pyspark.sql.Column` or column name |
| The binary value to encrypt. |
| key : :class:`~pyspark.sql.Column` or column name |
| The passphrase to use to encrypt the data. |
| mode : :class:`~pyspark.sql.Column` or str, optional |
| Specifies which block cipher mode should be used to encrypt messages. Valid modes: ECB, |
| GCM, CBC. |
| padding : :class:`~pyspark.sql.Column` or column name, optional |
| Specifies how to pad messages whose length is not a multiple of the block size. Valid |
| values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS |
| for CBC. |
| iv : :class:`~pyspark.sql.Column` or column name, optional |
| Optional initialization vector. Only supported for CBC and GCM modes. Valid values: None or |
| "". 16-byte array for CBC mode. 12-byte array for GCM mode. |
| aad : :class:`~pyspark.sql.Column` or column name, optional |
| Optional additional authenticated data. Only supported for GCM mode. This can be any |
| free-form input and must be provided for both encryption and decryption. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains an encrypted value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.aes_decrypt` |
| :meth:`pyspark.sql.functions.try_aes_decrypt` |
| |
| Examples |
| -------- |
| |
| Example 1: Encrypt data with key, mode, padding, iv and aad. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "Spark", "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT", |
| ... "000000000000000000000000", "This is an AAD mixed into the input",)], |
| ... ["input", "key", "mode", "padding", "iv", "aad"] |
| ... ) |
| >>> df.select(sf.base64(sf.aes_encrypt( |
| ... df.input, df.key, "mode", df.padding, sf.to_binary(df.iv, sf.lit("hex")), df.aad) |
| ... )).show(truncate=False) |
| +-----------------------------------------------------------------------+ |
| |base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), aad))| |
| +-----------------------------------------------------------------------+ |
| |AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4 | |
| +-----------------------------------------------------------------------+ |
| |
| Example 2: Encrypt data with key, mode, padding and iv. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "Spark", "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT", |
| ... "000000000000000000000000", "This is an AAD mixed into the input",)], |
| ... ["input", "key", "mode", "padding", "iv", "aad"] |
| ... ) |
| >>> df.select(sf.base64(sf.aes_encrypt( |
| ... df.input, df.key, "mode", df.padding, sf.to_binary(df.iv, sf.lit("hex"))) |
| ... )).show(truncate=False) |
| +--------------------------------------------------------------------+ |
| |base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), ))| |
| +--------------------------------------------------------------------+ |
| |AAAAAAAAAAAAAAAAQiYi+sRNYDAOTjdSEcYBFsAWPL1f | |
| +--------------------------------------------------------------------+ |
| |
| Example 3: Encrypt data with key, mode and padding. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "Spark SQL", "1234567890abcdef", "ECB", "PKCS",)], |
| ... ["input", "key", "mode", "padding"] |
| ... ) |
| >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, "mode", df.padding), |
| ... df.key, df.mode, df.padding |
| ... ).cast("STRING")).show(truncate=False) |
| +---------------------------------------------------------------------------------------------+ |
| |CAST(aes_decrypt(aes_encrypt(input, key, mode, padding, , ), key, mode, padding, ) AS STRING)| |
| +---------------------------------------------------------------------------------------------+ |
| |Spark SQL | |
| +---------------------------------------------------------------------------------------------+ |
| |
| Example 4: Encrypt data with key and mode. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "Spark SQL", "0000111122223333", "ECB",)], |
| ... ["input", "key", "mode"] |
| ... ) |
| >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, "mode"), |
| ... df.key, df.mode |
| ... ).cast("STRING")).show(truncate=False) |
| +---------------------------------------------------------------------------------------------+ |
| |CAST(aes_decrypt(aes_encrypt(input, key, mode, DEFAULT, , ), key, mode, DEFAULT, ) AS STRING)| |
| +---------------------------------------------------------------------------------------------+ |
| |Spark SQL | |
| +---------------------------------------------------------------------------------------------+ |
| |
| Example 5: Encrypt data with key. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "Spark SQL", "abcdefghijklmnop",)], |
| ... ["input", "key"] |
| ... ) |
| >>> df.select(sf.aes_decrypt( |
| ... sf.unbase64(sf.base64(sf.aes_encrypt(df.input, df.key))), df.key |
| ... ).cast("STRING")).show(truncate=False) |
| +-------------------------------------------------------------------------------------------------------------+ |
| |CAST(aes_decrypt(unbase64(base64(aes_encrypt(input, key, GCM, DEFAULT, , ))), key, GCM, DEFAULT, ) AS STRING)| |
| +-------------------------------------------------------------------------------------------------------------+ |
| |Spark SQL | |
| +-------------------------------------------------------------------------------------------------------------+ |
| """ # noqa: E501 |
| _mode = lit("GCM") if mode is None else mode |
| _padding = lit("DEFAULT") if padding is None else padding |
| _iv = lit("") if iv is None else iv |
| _aad = lit("") if aad is None else aad |
| return _invoke_function_over_columns("aes_encrypt", input, key, _mode, _padding, _iv, _aad) |
| |
| |
| @_try_remote_functions |
| def aes_decrypt( |
| input: "ColumnOrName", |
| key: "ColumnOrName", |
| mode: Optional["ColumnOrName"] = None, |
| padding: Optional["ColumnOrName"] = None, |
| aad: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| Returns a decrypted value of `input` using AES in `mode` with `padding`. Key lengths of 16, |
24 and 32 bytes are supported. Supported combinations of (`mode`, `padding`) are ('ECB',
| 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional additional authenticated data (AAD) is |
| only supported for GCM. If provided for encryption, the identical AAD value must be provided |
| for decryption. The default mode is GCM. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| input : :class:`~pyspark.sql.Column` or column name |
| The binary value to decrypt. |
| key : :class:`~pyspark.sql.Column` or column name |
| The passphrase to use to decrypt the data. |
| mode : :class:`~pyspark.sql.Column` or column name, optional |
| Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB, |
| GCM, CBC. |
| padding : :class:`~pyspark.sql.Column` or column name, optional |
| Specifies how to pad messages whose length is not a multiple of the block size. Valid |
| values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS |
| for CBC. |
| aad : :class:`~pyspark.sql.Column` or column name, optional |
| Optional additional authenticated data. Only supported for GCM mode. This can be any |
| free-form input and must be provided for both encryption and decryption. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a decrypted value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.aes_encrypt` |
| :meth:`pyspark.sql.functions.try_aes_decrypt` |
| |
| Examples |
| -------- |
| |
| Example 1: Decrypt data with key, mode, padding and aad. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4", |
| ... "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT", |
| ... "This is an AAD mixed into the input",)], |
| ... ["input", "key", "mode", "padding", "aad"] |
| ... ) |
| >>> df.select(sf.aes_decrypt( |
| ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad |
| ... ).cast("STRING")).show(truncate=False) |
| +---------------------------------------------------------------------+ |
| |CAST(aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)| |
| +---------------------------------------------------------------------+ |
| |Spark | |
| +---------------------------------------------------------------------+ |
| |
| Example 2: Decrypt data with key, mode and padding. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=", |
| ... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)], |
| ... ["input", "key", "mode", "padding"] |
| ... ) |
| >>> df.select(sf.aes_decrypt( |
| ... sf.unbase64(df.input), df.key, "mode", df.padding |
| ... ).cast("STRING")).show(truncate=False) |
| +------------------------------------------------------------------+ |
| |CAST(aes_decrypt(unbase64(input), key, mode, padding, ) AS STRING)| |
| +------------------------------------------------------------------+ |
| |Spark | |
| +------------------------------------------------------------------+ |
| |
| Example 3: Decrypt data with key and mode. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=", |
| ... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)], |
| ... ["input", "key", "mode", "padding"] |
| ... ) |
| >>> df.select(sf.aes_decrypt( |
| ... sf.unbase64(df.input), df.key, "mode" |
| ... ).cast("STRING")).show(truncate=False) |
| +------------------------------------------------------------------+ |
| |CAST(aes_decrypt(unbase64(input), key, mode, DEFAULT, ) AS STRING)| |
| +------------------------------------------------------------------+ |
| |Spark | |
| +------------------------------------------------------------------+ |
| |
| Example 4: Decrypt data with key. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94", |
| ... "0000111122223333",)], |
| ... ["input", "key"] |
| ... ) |
| >>> df.select(sf.aes_decrypt( |
| ... sf.unhex(df.input), df.key |
| ... ).cast("STRING")).show(truncate=False) |
| +--------------------------------------------------------------+ |
| |CAST(aes_decrypt(unhex(input), key, GCM, DEFAULT, ) AS STRING)| |
| +--------------------------------------------------------------+ |
| |Spark | |
| +--------------------------------------------------------------+ |
| """ |
| _mode = lit("GCM") if mode is None else mode |
| _padding = lit("DEFAULT") if padding is None else padding |
| _aad = lit("") if aad is None else aad |
| return _invoke_function_over_columns("aes_decrypt", input, key, _mode, _padding, _aad) |
| |
| |
| @_try_remote_functions |
| def try_aes_decrypt( |
| input: "ColumnOrName", |
| key: "ColumnOrName", |
| mode: Optional["ColumnOrName"] = None, |
| padding: Optional["ColumnOrName"] = None, |
| aad: Optional["ColumnOrName"] = None, |
| ) -> Column: |
| """ |
| This is a special version of `aes_decrypt` that performs the same operation, |
| but returns a NULL value instead of raising an error if the decryption cannot be performed. |
| Returns a decrypted value of `input` using AES in `mode` with `padding`. Key lengths of 16, |
24 and 32 bytes are supported. Supported combinations of (`mode`, `padding`) are ('ECB',
| 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional additional authenticated data (AAD) is |
| only supported for GCM. If provided for encryption, the identical AAD value must be provided |
| for decryption. The default mode is GCM. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| input : :class:`~pyspark.sql.Column` or column name |
| The binary value to decrypt. |
| key : :class:`~pyspark.sql.Column` or column name |
| The passphrase to use to decrypt the data. |
| mode : :class:`~pyspark.sql.Column` or column name, optional |
| Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB, |
| GCM, CBC. |
| padding : :class:`~pyspark.sql.Column` or column name, optional |
| Specifies how to pad messages whose length is not a multiple of the block size. Valid |
| values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS |
| for CBC. |
| aad : :class:`~pyspark.sql.Column` or column name, optional |
| Optional additional authenticated data. Only supported for GCM mode. This can be any |
| free-form input and must be provided for both encryption and decryption. |
| |
| Returns |
| ------- |
| :class:`~pyspark.sql.Column` |
| A new column that contains a decrypted value or a NULL value. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.aes_encrypt` |
| :meth:`pyspark.sql.functions.aes_decrypt` |
| |
| Examples |
| -------- |
| |
| Example 1: Decrypt data with key, mode, padding and aad. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4", |
| ... "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT", |
| ... "This is an AAD mixed into the input",)], |
| ... ["input", "key", "mode", "padding", "aad"] |
| ... ) |
| >>> df.select(sf.try_aes_decrypt( |
| ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad |
| ... ).cast("STRING")).show(truncate=False) |
| +-------------------------------------------------------------------------+ |
| |CAST(try_aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)| |
| +-------------------------------------------------------------------------+ |
| |Spark | |
| +-------------------------------------------------------------------------+ |
| |
| Example 2: Failed to decrypt data with key, mode, padding and aad. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4", |
| ... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT", |
| ... "This is an AAD mixed into the input",)], |
| ... ["input", "key", "mode", "padding", "aad"] |
| ... ) |
| >>> df.select(sf.try_aes_decrypt( |
| ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad |
| ... ).cast("STRING")).show(truncate=False) |
| +-------------------------------------------------------------------------+ |
| |CAST(try_aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)| |
| +-------------------------------------------------------------------------+ |
| |NULL | |
| +-------------------------------------------------------------------------+ |
| |
| Example 3: Decrypt data with key, mode and padding. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=", |
| ... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)], |
| ... ["input", "key", "mode", "padding"] |
| ... ) |
| >>> df.select(sf.try_aes_decrypt( |
| ... sf.unbase64(df.input), df.key, "mode", df.padding |
| ... ).cast("STRING")).show(truncate=False) |
| +----------------------------------------------------------------------+ |
| |CAST(try_aes_decrypt(unbase64(input), key, mode, padding, ) AS STRING)| |
| +----------------------------------------------------------------------+ |
| |Spark | |
| +----------------------------------------------------------------------+ |
| |
| Example 4: Decrypt data with key and mode. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=", |
| ... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)], |
| ... ["input", "key", "mode", "padding"] |
| ... ) |
| >>> df.select(sf.try_aes_decrypt( |
| ... sf.unbase64(df.input), df.key, "mode" |
| ... ).cast("STRING")).show(truncate=False) |
| +----------------------------------------------------------------------+ |
| |CAST(try_aes_decrypt(unbase64(input), key, mode, DEFAULT, ) AS STRING)| |
| +----------------------------------------------------------------------+ |
| |Spark | |
| +----------------------------------------------------------------------+ |
| |
| Example 5: Decrypt data with key. |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([( |
| ... "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94", |
| ... "0000111122223333",)], |
| ... ["input", "key"] |
| ... ) |
| >>> df.select(sf.try_aes_decrypt( |
| ... sf.unhex(df.input), df.key |
| ... ).cast("STRING")).show(truncate=False) |
| +------------------------------------------------------------------+ |
| |CAST(try_aes_decrypt(unhex(input), key, GCM, DEFAULT, ) AS STRING)| |
| +------------------------------------------------------------------+ |
| |Spark | |
| +------------------------------------------------------------------+ |
| """ |
| _mode = lit("GCM") if mode is None else mode |
| _padding = lit("DEFAULT") if padding is None else padding |
| _aad = lit("") if aad is None else aad |
| return _invoke_function_over_columns("try_aes_decrypt", input, key, _mode, _padding, _aad) |
| |
| |
| @_try_remote_functions |
| def sha(col: "ColumnOrName") -> Column: |
| """ |
Returns a SHA-1 hash value of `col` as a hex string.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.sha1` |
| :meth:`pyspark.sql.functions.sha2` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select(sf.sha(sf.lit("Spark"))).show() |
| +--------------------+ |
| | sha(Spark)| |
| +--------------------+ |
| |85f5955f4b27a9a4c...| |
| +--------------------+ |
| """ |
| return _invoke_function_over_columns("sha", col) |
| |
| |
| @_try_remote_functions |
| def input_file_block_length() -> Column: |
| """ |
| Returns the length of the block being read, or -1 if not available. |
| |
| .. versionadded:: 3.5.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.input_file_name` |
| :meth:`pyspark.sql.functions.input_file_block_start` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.read.text("python/test_support/sql/ages_newlines.csv", lineSep=",") |
| >>> df.select(sf.input_file_block_length()).show() |
| +-------------------------+ |
| |input_file_block_length()| |
| +-------------------------+ |
| | 87| |
| | 87| |
| | 87| |
| | 87| |
| | 87| |
| | 87| |
| | 87| |
| | 87| |
| +-------------------------+ |
| """ |
| return _invoke_function_over_columns("input_file_block_length") |
| |
| |
| @_try_remote_functions |
| def input_file_block_start() -> Column: |
| """ |
| Returns the start offset of the block being read, or -1 if not available. |
| |
| .. versionadded:: 3.5.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.input_file_name` |
| :meth:`pyspark.sql.functions.input_file_block_length` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.read.text("python/test_support/sql/ages_newlines.csv", lineSep=",") |
| >>> df.select(sf.input_file_block_start()).show() |
| +------------------------+ |
| |input_file_block_start()| |
| +------------------------+ |
| | 0| |
| | 0| |
| | 0| |
| | 0| |
| | 0| |
| | 0| |
| | 0| |
| | 0| |
| +------------------------+ |
| """ |
| return _invoke_function_over_columns("input_file_block_start") |
| |
| |
| @_try_remote_functions |
| def reflect(*cols: "ColumnOrName") -> Column: |
| """ |
| Calls a method with reflection. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
the first element should be a Column representing a literal string with the class name,
the second element should be a Column representing a literal string with the method name,
and the remaining elements are input arguments (Columns or column names) to the Java method.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.java_method` |
| :meth:`pyspark.sql.functions.try_reflect` |
| |
| Examples |
| -------- |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('a5cf6c42-0c85-418f-af6c-3e4e5b1328f2',)], ['a']) |
| >>> df.select( |
| ... sf.reflect(sf.lit('java.util.UUID'), sf.lit('fromString'), 'a') |
| ... ).show(truncate=False) |
| +--------------------------------------+ |
| |reflect(java.util.UUID, fromString, a)| |
| +--------------------------------------+ |
| |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 | |
| +--------------------------------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("reflect", cols) |
| |
| |
| @_try_remote_functions |
| def java_method(*cols: "ColumnOrName") -> Column: |
| """ |
| Calls a method with reflection. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
the first element should be a Column representing a literal string with the class name,
the second element should be a Column representing a literal string with the method name,
and the remaining elements are input arguments (Columns or column names) to the Java method.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.reflect` |
| :meth:`pyspark.sql.functions.try_reflect` |
| |
| Examples |
| -------- |
| Example 1: Reflecting a method call with a column argument |
| |
| >>> import pyspark.sql.functions as sf |
| >>> spark.range(1).select( |
| ... sf.java_method( |
| ... sf.lit("java.util.UUID"), |
| ... sf.lit("fromString"), |
| ... sf.lit("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2") |
| ... ) |
| ... ).show(truncate=False) |
| +-----------------------------------------------------------------------------+ |
| |java_method(java.util.UUID, fromString, a5cf6c42-0c85-418f-af6c-3e4e5b1328f2)| |
| +-----------------------------------------------------------------------------+ |
| |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 | |
| +-----------------------------------------------------------------------------+ |
| |
| Example 2: Reflecting a method call with a column name argument |
| |
| >>> import pyspark.sql.functions as sf |
| >>> df = spark.createDataFrame([('a5cf6c42-0c85-418f-af6c-3e4e5b1328f2',)], ['a']) |
| >>> df.select( |
| ... sf.java_method(sf.lit('java.util.UUID'), sf.lit('fromString'), 'a') |
| ... ).show(truncate=False) |
| +------------------------------------------+ |
| |java_method(java.util.UUID, fromString, a)| |
| +------------------------------------------+ |
| |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 | |
| +------------------------------------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("java_method", cols) |
| |
| |
| @_try_remote_functions |
| def try_reflect(*cols: "ColumnOrName") -> Column: |
| """ |
This is a special version of `reflect` that performs the same operation, but returns a NULL
value instead of raising an error if the invoked method throws an exception.

| .. versionadded:: 4.0.0 |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
the first element should be a Column representing a literal string with the class name,
the second element should be a Column representing a literal string with the method name,
and the remaining elements are input arguments (Columns or column names) to the Java method.
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.reflect` |
| :meth:`pyspark.sql.functions.java_method` |
| |
| Examples |
| -------- |
| Example 1: Reflecting a method call with arguments |
| |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"]) |
| >>> df.select( |
| ... sf.try_reflect(sf.lit("java.util.UUID"), sf.lit("fromString"), "a") |
| ... ).show(truncate=False) |
| +------------------------------------------+ |
| |try_reflect(java.util.UUID, fromString, a)| |
| +------------------------------------------+ |
| |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 | |
| +------------------------------------------+ |
| |
| Example 2: Exception in the reflection call, resulting in null |
| |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select( |
| ... sf.try_reflect(sf.lit("scala.Predef"), sf.lit("require"), sf.lit(False)) |
| ... ).show(truncate=False) |
| +-----------------------------------------+ |
| |try_reflect(scala.Predef, require, false)| |
| +-----------------------------------------+ |
| |NULL | |
| +-----------------------------------------+ |
| """ |
| return _invoke_function_over_seq_of_columns("try_reflect", cols) |
| |
| |
| @_try_remote_functions |
| def version() -> Column: |
| """ |
| Returns the Spark version. The string contains 2 fields, the first being a release version |
| and the second being a git revision. |
| |
| .. versionadded:: 3.5.0 |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> spark.range(1).select(sf.version()).show(truncate=False) # doctest: +SKIP |
| +----------------------------------------------+ |
| |version() | |
| +----------------------------------------------+ |
| |4.0.0 4f8d1f575e99aeef8990c63a9614af0fc5479330| |
| +----------------------------------------------+ |
| """ |
| return _invoke_function_over_columns("version") |
| |
| |
| @_try_remote_functions |
| def typeof(col: "ColumnOrName") -> Column: |
| """ |
Returns a DDL-formatted type string for the data type of the input.
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(True, 1, 1.0, 'xyz',)], ['a', 'b', 'c', 'd']) |
| >>> df.select(sf.typeof(df.a), sf.typeof(df.b), sf.typeof('c'), sf.typeof('d')).show() |
| +---------+---------+---------+---------+ |
| |typeof(a)|typeof(b)|typeof(c)|typeof(d)| |
| +---------+---------+---------+---------+ |
| | boolean| bigint| double| string| |
| +---------+---------+---------+---------+ |
| """ |
| return _invoke_function_over_columns("typeof", col) |
| |
| |
| @_try_remote_functions |
| def stack(*cols: "ColumnOrName") -> Column: |
| """ |
| Separates `col1`, ..., `colk` into `n` rows. Uses column names col0, col1, etc. by default |
| unless specified otherwise. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| cols : :class:`~pyspark.sql.Column` or column name |
the first element should be a literal int for the number of rows to be separated,
and the remaining elements are the input values to be separated.
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c']) |
| >>> df.select('*', sf.stack(sf.lit(2), df.a, df.b, 'c')).show() |
| +---+---+---+----+----+ |
| | a| b| c|col0|col1| |
| +---+---+---+----+----+ |
| | 1| 2| 3| 1| 2| |
| | 1| 2| 3| 3|NULL| |
| +---+---+---+----+----+ |
| |
| >>> df.select('*', sf.stack(sf.lit(2), df.a, df.b, 'c').alias('x', 'y')).show() |
| +---+---+---+---+----+ |
| | a| b| c| x| y| |
| +---+---+---+---+----+ |
| | 1| 2| 3| 1| 2| |
| | 1| 2| 3| 3|NULL| |
| +---+---+---+---+----+ |
| |
| >>> df.select('*', sf.stack(sf.lit(3), df.a, df.b, 'c')).show() |
| +---+---+---+----+ |
| | a| b| c|col0| |
| +---+---+---+----+ |
| | 1| 2| 3| 1| |
| | 1| 2| 3| 2| |
| | 1| 2| 3| 3| |
| +---+---+---+----+ |
| |
| >>> df.select('*', sf.stack(sf.lit(4), df.a, df.b, 'c')).show() |
| +---+---+---+----+ |
| | a| b| c|col0| |
| +---+---+---+----+ |
| | 1| 2| 3| 1| |
| | 1| 2| 3| 2| |
| | 1| 2| 3| 3| |
| | 1| 2| 3|NULL| |
| +---+---+---+----+ |
| """ |
| return _invoke_function_over_seq_of_columns("stack", cols) |
| |
| |
| @_try_remote_functions |
| def bitmap_bit_position(col: "ColumnOrName") -> Column: |
| """ |
| Returns the bit position for the given input column. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The input column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bitmap_bucket_number` |
| :meth:`pyspark.sql.functions.bitmap_construct_agg` |
| :meth:`pyspark.sql.functions.bitmap_count` |
| :meth:`pyspark.sql.functions.bitmap_or_agg` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(123,)], ['a']) |
| >>> df.select('*', sf.bitmap_bit_position('a')).show() |
| +---+----------------------+ |
| | a|bitmap_bit_position(a)| |
| +---+----------------------+ |
| |123| 122| |
| +---+----------------------+ |
| """ |
| return _invoke_function_over_columns("bitmap_bit_position", col) |
| |
| |
| @_try_remote_functions |
| def bitmap_bucket_number(col: "ColumnOrName") -> Column: |
| """ |
| Returns the bucket number for the given input column. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The input column. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bitmap_bit_position` |
| :meth:`pyspark.sql.functions.bitmap_construct_agg` |
| :meth:`pyspark.sql.functions.bitmap_count` |
| :meth:`pyspark.sql.functions.bitmap_or_agg` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(123,)], ['a']) |
| >>> df.select('*', sf.bitmap_bucket_number('a')).show() |
| +---+-----------------------+ |
| | a|bitmap_bucket_number(a)| |
| +---+-----------------------+ |
| |123| 1| |
| +---+-----------------------+ |
| """ |
| return _invoke_function_over_columns("bitmap_bucket_number", col) |
| |
| |
| @_try_remote_functions |
| def bitmap_construct_agg(col: "ColumnOrName") -> Column: |
| """ |
| Returns a bitmap with the positions of the bits set from all the values from the input column. |
| The input column will most likely be bitmap_bit_position(). |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The input column will most likely be bitmap_bit_position(). |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bitmap_bit_position` |
| :meth:`pyspark.sql.functions.bitmap_bucket_number` |
| :meth:`pyspark.sql.functions.bitmap_count` |
| :meth:`pyspark.sql.functions.bitmap_or_agg` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([(1,),(2,),(3,)], ["a"]) |
| >>> df.select( |
| ... sf.bitmap_construct_agg(sf.bitmap_bit_position('a')) |
| ... ).show() |
| +--------------------------------------------+ |
| |bitmap_construct_agg(bitmap_bit_position(a))| |
| +--------------------------------------------+ |
| | [07 00 00 00 00 0...| |
| +--------------------------------------------+ |
| """ |
| return _invoke_function_over_columns("bitmap_construct_agg", col) |
| |
| |
| @_try_remote_functions |
| def bitmap_count(col: "ColumnOrName") -> Column: |
| """ |
| Returns the number of set bits in the input bitmap. |
| |
| .. versionadded:: 3.5.0 |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The input bitmap. |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bitmap_bit_position` |
| :meth:`pyspark.sql.functions.bitmap_bucket_number` |
| :meth:`pyspark.sql.functions.bitmap_construct_agg` |
| :meth:`pyspark.sql.functions.bitmap_or_agg` |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("FFFF",)], ["a"]) |
| >>> df.select(sf.bitmap_count(sf.to_binary(df.a, sf.lit("hex")))).show() |
| +-------------------------------+ |
| |bitmap_count(to_binary(a, hex))| |
| +-------------------------------+ |
| | 16| |
| +-------------------------------+ |
| """ |
| return _invoke_function_over_columns("bitmap_count", col) |
| |
| |
| @_try_remote_functions |
| def bitmap_or_agg(col: "ColumnOrName") -> Column: |
| """ |
| Returns a bitmap that is the bitwise OR of all of the bitmaps from the input column. |
| The input column should be bitmaps created from bitmap_construct_agg(). |
| |
| .. versionadded:: 3.5.0 |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.bitmap_bit_position` |
| :meth:`pyspark.sql.functions.bitmap_bucket_number` |
| :meth:`pyspark.sql.functions.bitmap_construct_agg` |
| :meth:`pyspark.sql.functions.bitmap_count` |
| |
| Parameters |
| ---------- |
| col : :class:`~pyspark.sql.Column` or column name |
| The input column should be bitmaps created from bitmap_construct_agg(). |
| |
| Examples |
| -------- |
| >>> from pyspark.sql import functions as sf |
| >>> df = spark.createDataFrame([("10",),("20",),("40",)], ["a"]) |
| >>> df.select(sf.bitmap_or_agg(sf.to_binary(df.a, sf.lit("hex")))).show() |
| +--------------------------------+ |
| |bitmap_or_agg(to_binary(a, hex))| |
| +--------------------------------+ |
| | [70 00 00 00 00 0...| |
| +--------------------------------+ |
| """ |
| return _invoke_function_over_columns("bitmap_or_agg", col) |
| |
| |
| # ---------------------------- User Defined Function ---------------------------------- |
| |
| |
| @overload |
| def udf( |
| f: Callable[..., Any], |
| returnType: "DataTypeOrString" = StringType(), |
| *, |
| useArrow: Optional[bool] = None, |
| ) -> "UserDefinedFunctionLike": |
| ... |
| |
| |
| @overload |
| def udf( |
| f: Optional["DataTypeOrString"] = None, |
| *, |
| useArrow: Optional[bool] = None, |
| ) -> Callable[[Callable[..., Any]], "UserDefinedFunctionLike"]: |
| ... |
| |
| |
| @overload |
| def udf( |
| *, |
| returnType: "DataTypeOrString" = StringType(), |
| useArrow: Optional[bool] = None, |
| ) -> Callable[[Callable[..., Any]], "UserDefinedFunctionLike"]: |
| ... |
| |
| |
| @_try_remote_functions |
| def udf( |
| f: Optional[Union[Callable[..., Any], "DataTypeOrString"]] = None, |
| returnType: "DataTypeOrString" = StringType(), |
| *, |
| useArrow: Optional[bool] = None, |
| ) -> Union["UserDefinedFunctionLike", Callable[[Callable[..., Any]], "UserDefinedFunctionLike"]]: |
| """Creates a user defined function (UDF). |
| |
| .. versionadded:: 1.3.0 |
| |
| .. versionchanged:: 3.4.0 |
| Supports Spark Connect. |
| |
| .. versionchanged:: 4.0.0 |
| Supports keyword-arguments. |
| |
| Parameters |
| ---------- |
| f : function, optional |
| python function if used as a standalone function |
| |
| .. versionchanged:: 4.1.0 |
Supports vectorized functions by specifying type hints.
| |
| returnType : :class:`pyspark.sql.types.DataType` or str, optional |
| the return type of the user-defined function. The value can be either a |
| :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. |
| Defaults to :class:`StringType`. |
| useArrow : bool, optional |
| whether to use Arrow to optimize the (de)serialization. When it is None, the |
| Spark config "spark.sql.execution.pythonUDF.arrow.enabled" takes effect. |
| |
| Examples |
| -------- |
| >>> from pyspark.sql.types import IntegerType |
| >>> slen = udf(lambda s: len(s), IntegerType()) |
| >>> @udf |
| ... def to_upper(s): |
| ... if s is not None: |
| ... return s.upper() |
| ... |
| >>> @udf(returnType=IntegerType()) |
| ... def add_one(x): |
| ... if x is not None: |
| ... return x + 1 |
| ... |
| >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) |
| >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show() |
| +----------+--------------+------------+ |
| |slen(name)|to_upper(name)|add_one(age)| |
| +----------+--------------+------------+ |
| | 8| JOHN DOE| 22| |
| +----------+--------------+------------+ |
| |
| UDF can use keyword arguments: |
| |
| >>> @udf(returnType=IntegerType()) |
| ... def calc(a, b): |
| ... return a + 10 * b |
| ... |
| >>> spark.range(2).select(calc(b=col("id") * 10, a=col("id"))).show() |
| +-----------------------------+ |
| |calc(b => (id * 10), a => id)| |
| +-----------------------------+ |
| | 0| |
| | 101| |
| +-----------------------------+ |
| |
Vectorized functions are also supported by specifying type hints.

To be treated as a vectorized function, the function should meet the following requirements:

1. it has at least one argument; zero-argument functions are not supported;

2. its type hints match one of the patterns of pandas UDFs or Arrow UDFs;

3. the argument `useArrow` is not explicitly set.

If a function does not meet these requirements, it is treated as a vanilla Python UDF or an
Arrow-optimized Python UDF (depending on the argument `useArrow`, the configuration
`spark.sql.execution.pythonUDF.arrow.enabled`, and the installed dependencies).
| |
| For example, define a 'Series to Series' type pandas UDF. |
| |
| >>> from pyspark.sql.functions import udf, PandasUDFType |
| >>> import pandas as pd |
| >>> @udf(returnType=IntegerType()) |
| ... def pd_calc(a: pd.Series, b: pd.Series) -> pd.Series: |
| ... return a + 10 * b |
| ... |
| >>> pd_calc.evalType == PandasUDFType.SCALAR |
| True |
| >>> spark.range(2).select(pd_calc(b=col("id") * 10, a="id")).show() |
| +--------------------------------+ |
| |pd_calc(b => (id * 10), a => id)| |
| +--------------------------------+ |
| | 0| |
| | 101| |
| +--------------------------------+ |
| |
For another example, define an 'Array to Array' type Arrow UDF.
| |
| >>> from pyspark.sql.functions import udf, ArrowUDFType |
| >>> import pyarrow as pa |
| >>> @udf(returnType=IntegerType()) |
| ... def pa_calc(a: pa.Array, b: pa.Array) -> pa.Array: |
| ... return pa.compute.add(a, pa.compute.multiply(b, 10)) |
| ... |
| >>> pa_calc.evalType == ArrowUDFType.SCALAR |
| True |
| >>> spark.range(2).select(pa_calc(b=col("id") * 10, a="id")).show() |
| +--------------------------------+ |
| |pa_calc(b => (id * 10), a => id)| |
| +--------------------------------+ |
| | 0| |
| | 101| |
| +--------------------------------+ |
| |
| See Also |
| -------- |
| :meth:`pyspark.sql.functions.pandas_udf` |
| :meth:`pyspark.sql.functions.arrow_udf` |
| |
| Notes |
| ----- |
| The user-defined functions are considered deterministic by default. Due to |
| optimization, duplicate invocations may be eliminated or the function may even be invoked |
| more times than it is present in the query. If your function is not deterministic, call |
| `asNondeterministic` on the user defined function. E.g.: |
| |
| >>> from pyspark.sql.types import IntegerType |
| >>> import random |
| >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic() |
| |
The user-defined functions do not support conditional expressions or short-circuiting
in boolean expressions, so the whole expression may still be evaluated internally even
when guarded by a condition. If a function can fail on special rows, the workaround is
to incorporate the condition into the function itself.
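
For example (an illustrative sketch), the guard lives inside the function rather
than in a surrounding conditional expression:

>>> @udf(returnType=IntegerType())
... def safe_len(s):
...     # check for None inside the UDF since the caller cannot short-circuit around it
...     return len(s) if s is not None else -1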
| |
Prior to Spark 4.0.0, the user-defined functions did not take keyword arguments on the
calling side.
| """ |
| |
# The following table shows most of the Python data and SQL type conversions in normal UDFs
# that are not yet visible to the user. Some of the behaviors are buggy and might be changed
# in the near future. The table might have to be eventually documented externally.
| # Please see python/pyspark/sql/tests/udf_type_tests for type tests and golden files |
| |
| # decorator @udf, @udf(), @udf(dataType()) |
| if f is None or isinstance(f, (str, DataType)): |
| # If DataType has been passed as a positional argument |
| # for decorator use it as a returnType |
| return_type = f or returnType |
| return functools.partial( |
| _create_py_udf, |
| returnType=return_type, |
| useArrow=useArrow, |
| ) |
| else: |
| return _create_py_udf(f=f, returnType=returnType, useArrow=useArrow) |
| |
| |
| @_try_remote_functions |
| def udtf( |
| cls: Optional[Type] = None, |
| *, |
| returnType: Optional[Union[StructType, str]] = None, |
| useArrow: Optional[bool] = None, |
| ) -> Union["UserDefinedTableFunction", Callable[[Type], "UserDefinedTableFunction"]]: |
| """Creates a user defined table function (UDTF). |
| |
| .. versionadded:: 3.5.0 |
| |
| .. versionchanged:: 4.0.0 |
| Supports Python side analysis. |
| |
| .. versionchanged:: 4.0.0 |
| Supports keyword-arguments. |
| |
| Parameters |
| ---------- |
| cls : class, optional |
| the Python user-defined table function handler class. |
| returnType : :class:`pyspark.sql.types.StructType` or str, optional |
| the return type of the user-defined table function. The value can be either a |
| :class:`pyspark.sql.types.StructType` object or a DDL-formatted struct type string. |
If None, the handler class must provide an `analyze` static method.
| useArrow : bool, optional |
| whether to use Arrow to optimize the (de)serializations. When it's set to None, the |
| Spark config "spark.sql.execution.pythonUDTF.arrow.enabled" is used. |
| |
| Examples |
| -------- |
| Implement the UDTF class and create a UDTF: |
| |
| >>> class TestUDTF: |
| ... def eval(self, *args: Any): |
| ... yield "hello", "world" |
| ... |
| >>> from pyspark.sql.functions import udtf |
| >>> test_udtf = udtf(TestUDTF, returnType="c1: string, c2: string") |
| >>> test_udtf().show() |
| +-----+-----+ |
| | c1| c2| |
| +-----+-----+ |
| |hello|world| |
| +-----+-----+ |
| |
| UDTF can also be created using the decorator syntax: |
| |
| >>> @udtf(returnType="c1: int, c2: int") |
| ... class PlusOne: |
| ... def eval(self, x: int): |
| ... yield x, x + 1 |
| ... |
| >>> from pyspark.sql.functions import lit |
| >>> PlusOne(lit(1)).show() |
| +---+---+ |
| | c1| c2| |
| +---+---+ |
| | 1| 2| |
| +---+---+ |
| |
A UDTF can also have an `analyze` static method instead of a static return type:

The `analyze` static method should take the following arguments:
| |
| - The number and order of arguments are the same as the UDTF inputs |
| - Each argument is a :class:`pyspark.sql.udtf.AnalyzeArgument`, containing: |
| - dataType: DataType |
| - value: Any: the calculated value if the argument is foldable; otherwise None |
| - isTable: bool: True if the argument is a table argument |
| |
and return a :class:`pyspark.sql.udtf.AnalyzeResult`, containing:
| |
| - schema: StructType |
| |
| >>> from pyspark.sql.udtf import AnalyzeArgument, AnalyzeResult |
| >>> # or from pyspark.sql.functions import AnalyzeArgument, AnalyzeResult |
| >>> @udtf |
| ... class TestUDTFWithAnalyze: |
| ... @staticmethod |
| ... def analyze(a: AnalyzeArgument, b: AnalyzeArgument) -> AnalyzeResult: |
| ... return AnalyzeResult(StructType().add("a", a.dataType).add("b", b.dataType)) |
| ... |
| ... def eval(self, a, b): |
| ... yield a, b |
| ... |
| >>> TestUDTFWithAnalyze(lit(1), lit("x")).show() |
| +---+---+ |
| | a| b| |
| +---+---+ |
| | 1| x| |
| +---+---+ |
| |
| UDTF can use keyword arguments: |
| |
| >>> @udtf |
| ... class TestUDTFWithKwargs: |
| ... @staticmethod |
| ... def analyze( |
| ... a: AnalyzeArgument, b: AnalyzeArgument, **kwargs: AnalyzeArgument |
| ... ) -> AnalyzeResult: |
| ... return AnalyzeResult( |
| ... StructType().add("a", a.dataType) |
| ... .add("b", b.dataType) |
| ... .add("x", kwargs["x"].dataType) |
| ... ) |
| ... |
| ... def eval(self, a, b, **kwargs): |
| ... yield a, b, kwargs["x"] |
| ... |
| >>> TestUDTFWithKwargs(lit(1), x=lit("x"), b=lit("b")).show() |
| +---+---+---+ |
| | a| b| x| |
| +---+---+---+ |
| | 1| b| x| |
| +---+---+---+ |
| |
| >>> _ = spark.udtf.register("test_udtf", TestUDTFWithKwargs) |
| >>> spark.sql("SELECT * FROM test_udtf(1, x => 'x', b => 'b')").show() |
| +---+---+---+ |
| | a| b| x| |
| +---+---+---+ |
| | 1| b| x| |
| +---+---+---+ |
| |
| Arrow optimization can be explicitly enabled when creating UDTFs: |
| |
| >>> @udtf(returnType="c1: int, c2: int", useArrow=True) |
| ... class ArrowPlusOne: |
| ... def eval(self, x: int): |
| ... yield x, x + 1 |
| ... |
| >>> ArrowPlusOne(lit(1)).show() |
| +---+---+ |
| | c1| c2| |
| +---+---+ |
| | 1| 2| |
| +---+---+ |
| |
| Notes |
| ----- |
| User-defined table functions (UDTFs) are considered non-deterministic by default. |
| Use `asDeterministic()` to mark a function as deterministic. E.g.: |
| |
| >>> class PlusOne: |
| ... def eval(self, a: int): |
| ... yield a + 1, |
| >>> plus_one = udtf(PlusOne, returnType="r: int").asDeterministic() |
| |
| Use "yield" to produce one row for the UDTF result relation as many times |
| as needed. In the context of a lateral join, each such result row will be |
| associated with the most recent input row consumed from the "eval" method. |
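
For instance (an illustrative sketch, not executed as a doctest; the registered
name is arbitrary), a registered UDTF can be used in a lateral join so that each
input row feeds the "eval" method and produces its own output rows:

>>> _ = spark.udtf.register("plus_one_udtf", plus_one)  # doctest: +SKIP
>>> spark.sql(
...     "SELECT * FROM VALUES (1), (2) AS t(a), LATERAL plus_one_udtf(a)"
... ).show()  # doctest: +SKIP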
| |
| User-defined table functions are considered opaque to the optimizer by default. |
| As a result, operations like filters from WHERE clauses or limits from |
| LIMIT/OFFSET clauses that appear after the UDTF call will execute on the |
UDTF's result relation. By the same token, any relation forwarded as input to a
UDTF will be planned as a full table scan unless filtering or other logic is written
explicitly in a table subquery surrounding the provided input relation.
| |
Prior to Spark 4.0.0, user-defined table functions did not accept keyword arguments on
the calling side.
| """ |
| if cls is None: |
| return functools.partial(_create_py_udtf, returnType=returnType, useArrow=useArrow) |
| else: |
| return _create_py_udtf(cls=cls, returnType=returnType, useArrow=useArrow) |
| |
| |
| @_try_remote_functions |
| def arrow_udtf( |
| cls: Optional[Type] = None, |
| *, |
| returnType: Optional[Union[StructType, str]] = None, |
| ) -> Union["UserDefinedTableFunction", Callable[[Type], "UserDefinedTableFunction"]]: |
| """Creates a PyArrow-native user defined table function (UDTF). |
| |
| This function provides a PyArrow-native interface for UDTFs, where the eval method |
| receives PyArrow RecordBatches or Arrays and returns an Iterator of PyArrow Tables |
| or RecordBatches. |
| This enables true vectorized computation without row-by-row processing overhead. |
| |
| .. versionadded:: 4.1.0 |
| |
| Parameters |
| ---------- |
| cls : class, optional |
| the Python user-defined table function handler class. |
| returnType : :class:`pyspark.sql.types.StructType` or str, optional |
| the return type of the user-defined table function. The value can be either a |
| :class:`pyspark.sql.types.StructType` object or a DDL-formatted struct type string. |
| |
| Examples |
| -------- |
| UDTF with PyArrow RecordBatch input: |
| |
| >>> import pyarrow as pa |
| >>> from pyspark.sql.functions import arrow_udtf |
| >>> @arrow_udtf(returnType="x int, y int") |
| ... class MyUDTF: |
| ... def eval(self, batch: pa.RecordBatch): |
| ... # Process the entire batch vectorized |
| ... x_array = batch.column('x') |
| ... y_array = batch.column('y') |
| ... result_table = pa.table({ |
| ... 'x': x_array, |
| ... 'y': y_array |
| ... }) |
| ... yield result_table |
| ... |
| >>> df = spark.range(10).selectExpr("id as x", "id as y") |
| >>> MyUDTF(df.asTable()).show() # doctest: +SKIP |
| |
| UDTF with PyArrow Array inputs: |
| |
| >>> @arrow_udtf(returnType="x int, y int") |
| ... class MyUDTF2: |
| ... def eval(self, x: pa.Array, y: pa.Array): |
| ... # Process arrays vectorized |
| ... result_table = pa.table({ |
| ... 'x': x, |
| ... 'y': y |
| ... }) |
| ... yield result_table |
| ... |
| >>> MyUDTF2(lit(1), lit(2)).show() # doctest: +SKIP |
| |
| Notes |
| ----- |
| - The eval method must accept PyArrow RecordBatches or Arrays as input |
| - The eval method must yield PyArrow Tables or RecordBatches as output |
| """ |
| if cls is None: |
| return functools.partial(_create_pyarrow_udtf, returnType=returnType) |
| else: |
| return _create_pyarrow_udtf(cls=cls, returnType=returnType) |
| |
| |
| def _test() -> None: |
| import doctest |
| from pyspark.sql import SparkSession |
| import pyspark.sql.functions.builtin |
| |
| globs = pyspark.sql.functions.builtin.__dict__.copy() |
| spark = ( |
| SparkSession.builder.master("local[4]").appName("sql.functions.builtin tests").getOrCreate() |
| ) |
| sc = spark.sparkContext |
| globs["sc"] = sc |
| globs["spark"] = spark |
| (failure_count, test_count) = doctest.testmod( |
| pyspark.sql.functions.builtin, |
| globs=globs, |
| optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE, |
| ) |
| spark.stop() |
| if failure_count: |
| sys.exit(-1) |
| |
| |
| if __name__ == "__main__": |
| _test() |