| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| """ |
| String functions on pandas-on-Spark Series |
| """ |
| from typing import ( |
| Any, |
| Callable, |
| Dict, |
| List, |
| Optional, |
| Union, |
| TYPE_CHECKING, |
| cast, |
| no_type_check, |
| ) |
| |
| import numpy as np |
| |
| import pandas as pd |
| from pyspark.sql.types import StringType, BinaryType, ArrayType, LongType, MapType |
| from pyspark.sql import functions as F |
| from pyspark.sql.functions import pandas_udf |
| |
| from pyspark.pandas.spark import functions as SF |
| |
| if TYPE_CHECKING: |
| import pyspark.pandas as ps # noqa: F401 (SPARK-34943) |
| |
| |
| class StringMethods(object): |
| """String methods for pandas-on-Spark Series""" |
| |
def __init__(self, series: "ps.Series"):
    """
    Bind the string accessor to a Series after checking its Spark type.

    Parameters
    ----------
    series : ps.Series
        Series to wrap. Its Spark data type must be a string, binary or
        array type.

    Raises
    ------
    ValueError
        If the Spark data type of ``series`` is not one of the accepted types.
    """
    spark_type = series.spark.data_type
    if not isinstance(spark_type, (StringType, BinaryType, ArrayType)):
        raise ValueError("Cannot call StringMethods on type {}".format(spark_type))
    self._data = series
| |
| # Methods |
| def capitalize(self) -> "ps.Series": |
| """ |
| Convert Strings in the series to be capitalized. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) |
| >>> s |
| 0 lower |
| 1 CAPITALS |
| 2 this is a sentence |
| 3 SwApCaSe |
| dtype: object |
| |
| >>> s.str.capitalize() |
| 0 Lower |
| 1 Capitals |
| 2 This is a sentence |
| 3 Swapcase |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_capitalize(s) -> "ps.Series[str]": |
| return s.str.capitalize() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_capitalize) |
| |
| def title(self) -> "ps.Series": |
| """ |
| Convert Strings in the series to be titlecase. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) |
| >>> s |
| 0 lower |
| 1 CAPITALS |
| 2 this is a sentence |
| 3 SwApCaSe |
| dtype: object |
| |
| >>> s.str.title() |
| 0 Lower |
| 1 Capitals |
| 2 This Is A Sentence |
| 3 Swapcase |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_title(s) -> "ps.Series[str]": |
| return s.str.title() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_title) |
| |
def lower(self) -> "ps.Series":
    """
    Lowercase every string of the Series/Index.

    Implemented by applying the Spark SQL ``lower`` function to the
    underlying column.

    Returns
    -------
    Series of object

    Examples
    --------
    >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object
    """
    spark_accessor = self._data.spark
    return spark_accessor.transform(F.lower)
| |
def upper(self) -> "ps.Series":
    """
    Uppercase every string of the Series/Index.

    Implemented by applying the Spark SQL ``upper`` function to the
    underlying column.

    Returns
    -------
    Series of object

    Examples
    --------
    >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object
    """
    spark_accessor = self._data.spark
    return spark_accessor.transform(F.upper)
| |
| def swapcase(self) -> "ps.Series": |
| """ |
| Convert strings in the Series/Index to be swapcased. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) |
| >>> s |
| 0 lower |
| 1 CAPITALS |
| 2 this is a sentence |
| 3 SwApCaSe |
| dtype: object |
| |
| >>> s.str.swapcase() |
| 0 LOWER |
| 1 capitals |
| 2 THIS IS A SENTENCE |
| 3 sWaPcAsE |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_swapcase(s) -> "ps.Series[str]": |
| return s.str.swapcase() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_swapcase) |
| |
| def startswith(self, pattern: str, na: Optional[Any] = None) -> "ps.Series": |
| """ |
| Test if the start of each string element matches a pattern. |
| |
| Equivalent to :func:`str.startswith`. |
| |
| Parameters |
| ---------- |
| pattern : str |
| Character sequence. Regular expressions are not accepted. |
| na : object, default None |
| Object shown if element is not a string. NaN converted to None. |
| |
| Returns |
| ------- |
| Series of bool or object |
| pandas-on-Spark Series of booleans indicating whether the given pattern |
| matches the start of each string element. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['bat', 'Bear', 'cat', np.nan]) |
| >>> s |
| 0 bat |
| 1 Bear |
| 2 cat |
| 3 None |
| dtype: object |
| |
| >>> s.str.startswith('b') |
| 0 True |
| 1 False |
| 2 False |
| 3 None |
| dtype: object |
| |
| Specifying na to be False instead of None. |
| |
| >>> s.str.startswith('b', na=False) |
| 0 True |
| 1 False |
| 2 False |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_startswith(s) -> "ps.Series[bool]": |
| return s.str.startswith(pattern, na) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_startswith) |
| |
| def endswith(self, pattern: str, na: Optional[Any] = None) -> "ps.Series": |
| """ |
| Test if the end of each string element matches a pattern. |
| |
| Equivalent to :func:`str.endswith`. |
| |
| Parameters |
| ---------- |
| pattern : str |
| Character sequence. Regular expressions are not accepted. |
| na : object, default None |
| Object shown if element is not a string. NaN converted to None. |
| |
| Returns |
| ------- |
| Series of bool or object |
| pandas-on-Spark Series of booleans indicating whether the given pattern |
| matches the end of each string element. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['bat', 'Bear', 'cat', np.nan]) |
| >>> s |
| 0 bat |
| 1 Bear |
| 2 cat |
| 3 None |
| dtype: object |
| |
| >>> s.str.endswith('t') |
| 0 True |
| 1 False |
| 2 True |
| 3 None |
| dtype: object |
| |
| Specifying na to be False instead of None. |
| |
| >>> s.str.endswith('t', na=False) |
| 0 True |
| 1 False |
| 2 True |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_endswith(s) -> "ps.Series[bool]": |
| return s.str.endswith(pattern, na) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_endswith) |
| |
| def strip(self, to_strip: Optional[str] = None) -> "ps.Series": |
| """ |
| Remove leading and trailing characters. |
| |
| Strip whitespaces (including newlines) or a set of specified |
| characters from each string in the Series/Index from left and |
| right sides. Equivalent to :func:`str.strip`. |
| |
| Parameters |
| ---------- |
| to_strip : str |
| Specifying the set of characters to be removed. All combinations |
| of this set of characters will be stripped. If None then |
| whitespaces are removed. |
| |
| Returns |
| ------- |
| Series of objects |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None]) |
| >>> s |
| 0 1. Ant. |
| 1 2. Bee!\\t |
| 2 None |
| dtype: object |
| |
| >>> s.str.strip() |
| 0 1. Ant. |
| 1 2. Bee! |
| 2 None |
| dtype: object |
| |
| >>> s.str.strip('12.') |
| 0 Ant |
| 1 Bee!\\t |
| 2 None |
| dtype: object |
| |
| >>> s.str.strip('.!\\t') |
| 0 1. Ant |
| 1 2. Bee |
| 2 None |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_strip(s) -> "ps.Series[str]": |
| return s.str.strip(to_strip) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_strip) |
| |
| def lstrip(self, to_strip: Optional[str] = None) -> "ps.Series": |
| """ |
| Remove leading characters. |
| |
| Strip whitespaces (including newlines) or a set of specified |
| characters from each string in the Series/Index from left side. |
| Equivalent to :func:`str.lstrip`. |
| |
| Parameters |
| ---------- |
| to_strip : str |
| Specifying the set of characters to be removed. All combinations |
| of this set of characters will be stripped. If None then |
| whitespaces are removed. |
| |
| Returns |
| ------- |
| Series of object |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None]) |
| >>> s |
| 0 1. Ant. |
| 1 2. Bee!\\t |
| 2 None |
| dtype: object |
| |
| >>> s.str.lstrip('12.') |
| 0 Ant. |
| 1 Bee!\\t |
| 2 None |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_lstrip(s) -> "ps.Series[str]": |
| return s.str.lstrip(to_strip) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_lstrip) |
| |
| def rstrip(self, to_strip: Optional[str] = None) -> "ps.Series": |
| """ |
| Remove trailing characters. |
| |
| Strip whitespaces (including newlines) or a set of specified |
| characters from each string in the Series/Index from right side. |
| Equivalent to :func:`str.rstrip`. |
| |
| Parameters |
| ---------- |
| to_strip : str |
| Specifying the set of characters to be removed. All combinations |
| of this set of characters will be stripped. If None then |
| whitespaces are removed. |
| |
| Returns |
| ------- |
| Series of object |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None]) |
| >>> s |
| 0 1. Ant. |
| 1 2. Bee!\\t |
| 2 None |
| dtype: object |
| |
| >>> s.str.rstrip('.!\\t') |
| 0 1. Ant |
| 1 2. Bee |
| 2 None |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_rstrip(s) -> "ps.Series[str]": |
| return s.str.rstrip(to_strip) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_rstrip) |
| |
| def get(self, i: int) -> "ps.Series": |
| """ |
| Extract element from each string or string list/tuple in the Series |
| at the specified position. |
| |
| Parameters |
| ---------- |
| i : int |
| Position of element to extract. |
| |
| Returns |
| ------- |
| Series of objects |
| |
| Examples |
| -------- |
| >>> s1 = ps.Series(["String", "123"]) |
| >>> s1 |
| 0 String |
| 1 123 |
| dtype: object |
| |
| >>> s1.str.get(1) |
| 0 t |
| 1 2 |
| dtype: object |
| |
| >>> s1.str.get(-1) |
| 0 g |
| 1 3 |
| dtype: object |
| |
| >>> s2 = ps.Series([["a", "b", "c"], ["x", "y"]]) |
| >>> s2 |
| 0 [a, b, c] |
| 1 [x, y] |
| dtype: object |
| |
| >>> s2.str.get(0) |
| 0 a |
| 1 x |
| dtype: object |
| |
| >>> s2.str.get(2) |
| 0 c |
| 1 None |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_get(s) -> "ps.Series[str]": |
| return s.str.get(i) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_get) |
| |
| def isalnum(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are alphanumeric. |
| |
| This is equivalent to running the Python string method |
| :func:`str.isalnum` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s1 = ps.Series(['one', 'one1', '1', '']) |
| |
| >>> s1.str.isalnum() |
| 0 True |
| 1 True |
| 2 True |
| 3 False |
| dtype: bool |
| |
| Note that checks against characters mixed with any additional |
| punctuation or whitespace will evaluate to false for an alphanumeric |
| check. |
| |
| >>> s2 = ps.Series(['A B', '1.5', '3,000']) |
| >>> s2.str.isalnum() |
| 0 False |
| 1 False |
| 2 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_isalnum(s) -> "ps.Series[bool]": |
| return s.str.isalnum() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_isalnum) |
| |
| def isalpha(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are alphabetic. |
| |
| This is equivalent to running the Python string method |
| :func:`str.isalpha` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s1 = ps.Series(['one', 'one1', '1', '']) |
| |
| >>> s1.str.isalpha() |
| 0 True |
| 1 False |
| 2 False |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_isalpha(s) -> "ps.Series[bool]": |
| return s.str.isalpha() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_isalpha) |
| |
| def isdigit(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are digits. |
| |
| This is equivalent to running the Python string method |
| :func:`str.isdigit` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['23', '³', '⅕', '']) |
| |
| The s.str.isdecimal method checks for characters used to form numbers |
| in base 10. |
| |
| >>> s.str.isdecimal() |
| 0 True |
| 1 False |
| 2 False |
| 3 False |
| dtype: bool |
| |
| The s.str.isdigit method is the same as s.str.isdecimal but also |
| includes special digits, like superscripted and subscripted digits in |
| unicode. |
| |
| >>> s.str.isdigit() |
| 0 True |
| 1 True |
| 2 False |
| 3 False |
| dtype: bool |
| |
| The s.str.isnumeric method is the same as s.str.isdigit but also |
| includes other characters that can represent quantities such as unicode |
| fractions. |
| |
| >>> s.str.isnumeric() |
| 0 True |
| 1 True |
| 2 True |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_isdigit(s) -> "ps.Series[bool]": |
| return s.str.isdigit() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_isdigit) |
| |
| def isspace(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are whitespaces. |
| |
| This is equivalent to running the Python string method |
| :func:`str.isspace` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s = ps.Series([' ', '\\t\\r\\n ', '']) |
| >>> s.str.isspace() |
| 0 True |
| 1 True |
| 2 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_isspace(s) -> "ps.Series[bool]": |
| return s.str.isspace() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_isspace) |
| |
| def islower(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are lowercase. |
| |
| This is equivalent to running the Python string method |
| :func:`str.islower` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) |
| >>> s.str.islower() |
| 0 True |
| 1 False |
| 2 False |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_isspace(s) -> "ps.Series[bool]": |
| return s.str.islower() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_isspace) |
| |
| def isupper(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are uppercase. |
| |
| This is equivalent to running the Python string method |
| :func:`str.isupper` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) |
| >>> s.str.isupper() |
| 0 False |
| 1 False |
| 2 True |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_isspace(s) -> "ps.Series[bool]": |
| return s.str.isupper() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_isspace) |
| |
| def istitle(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are titlecase. |
| |
| This is equivalent to running the Python string method |
| :func:`str.istitle` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) |
| |
| The s.str.istitle method checks for whether all words are in title |
| case (whether only the first letter of each word is capitalized). |
| Words are assumed to be as any sequence of non-numeric characters |
| separated by whitespace characters. |
| |
| >>> s.str.istitle() |
| 0 False |
| 1 True |
| 2 False |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_istitle(s) -> "ps.Series[bool]": |
| return s.str.istitle() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_istitle) |
| |
| def isnumeric(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are numeric. |
| |
| This is equivalent to running the Python string method |
| :func:`str.isnumeric` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s1 = ps.Series(['one', 'one1', '1', '']) |
| >>> s1.str.isnumeric() |
| 0 False |
| 1 False |
| 2 True |
| 3 False |
| dtype: bool |
| |
| >>> s2 = ps.Series(['23', '³', '⅕', '']) |
| |
| The s2.str.isdecimal method checks for characters used to form numbers |
| in base 10. |
| |
| >>> s2.str.isdecimal() |
| 0 True |
| 1 False |
| 2 False |
| 3 False |
| dtype: bool |
| |
| The s2.str.isdigit method is the same as s2.str.isdecimal but also |
| includes special digits, like superscripted and subscripted digits in |
| unicode. |
| |
| >>> s2.str.isdigit() |
| 0 True |
| 1 True |
| 2 False |
| 3 False |
| dtype: bool |
| |
| The s2.str.isnumeric method is the same as s2.str.isdigit but also |
| includes other characters that can represent quantities such as unicode |
| fractions. |
| |
| >>> s2.str.isnumeric() |
| 0 True |
| 1 True |
| 2 True |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_isnumeric(s) -> "ps.Series[bool]": |
| return s.str.isnumeric() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_isnumeric) |
| |
| def isdecimal(self) -> "ps.Series": |
| """ |
| Check whether all characters in each string are decimals. |
| |
| This is equivalent to running the Python string method |
| :func:`str.isdecimal` for each element of the Series/Index. |
| If a string has zero characters, False is returned for that check. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['23', '³', '⅕', '']) |
| |
| The s.str.isdecimal method checks for characters used to form numbers |
| in base 10. |
| |
| >>> s.str.isdecimal() |
| 0 True |
| 1 False |
| 2 False |
| 3 False |
| dtype: bool |
| |
| The s.str.isdigit method is the same as s.str.isdecimal but also |
| includes special digits, like superscripted and subscripted digits in |
| unicode. |
| |
| >>> s.str.isdigit() |
| 0 True |
| 1 True |
| 2 False |
| 3 False |
| dtype: bool |
| |
| The s.str.isnumeric method is the same as s.str.isdigit but also |
| includes other characters that can represent quantities such as unicode |
| fractions. |
| |
| >>> s.str.isnumeric() |
| 0 True |
| 1 True |
| 2 True |
| 3 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_isdecimal(s) -> "ps.Series[bool]": |
| return s.str.isdecimal() |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_isdecimal) |
| |
| @no_type_check |
| def cat(self, others=None, sep=None, na_rep=None, join=None) -> "ps.Series": |
| """ |
| Not supported. |
| """ |
| raise NotImplementedError() |
| |
| def center(self, width: int, fillchar: str = " ") -> "ps.Series": |
| """ |
| Filling left and right side of strings in the Series/Index with an |
| additional character. Equivalent to :func:`str.center`. |
| |
| Parameters |
| ---------- |
| width : int |
| Minimum width of resulting string; additional characters will be |
| filled with fillchar. |
| fillchar : str |
| Additional character for filling, default is whitespace. |
| |
| Returns |
| ------- |
| Series of objects |
| |
| Examples |
| -------- |
| >>> s = ps.Series(["caribou", "tiger"]) |
| >>> s |
| 0 caribou |
| 1 tiger |
| dtype: object |
| |
| >>> s.str.center(width=10, fillchar='-') |
| 0 -caribou-- |
| 1 --tiger--- |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_center(s) -> "ps.Series[str]": |
| return s.str.center(width, fillchar) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_center) |
| |
| def contains( |
| self, pat: str, case: bool = True, flags: int = 0, na: Any = None, regex: bool = True |
| ) -> "ps.Series": |
| """ |
| Test if pattern or regex is contained within a string of a Series. |
| |
| Return boolean Series based on whether a given pattern or regex is |
| contained within a string of a Series. |
| |
| Analogous to :func:`match`, but less strict, relying on |
| :func:`re.search` instead of :func:`re.match`. |
| |
| Parameters |
| ---------- |
| pat : str |
| Character sequence or regular expression. |
| case : bool, default True |
| If True, case sensitive. |
| flags : int, default 0 (no flags) |
| Flags to pass through to the re module, e.g. re.IGNORECASE. |
| na : default None |
| Fill value for missing values. NaN converted to None. |
| regex : bool, default True |
| If True, assumes the pat is a regular expression. |
| If False, treats the pat as a literal string. |
| |
| |
| Returns |
| ------- |
| Series of boolean values or object |
| A Series of boolean values indicating whether the given pattern is |
| contained within the string of each element of the Series. |
| |
| Examples |
| -------- |
| Returning a Series of booleans using only a literal pattern. |
| |
| >>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) |
| >>> s1.str.contains('og', regex=False) |
| 0 False |
| 1 True |
| 2 False |
| 3 False |
| 4 None |
| dtype: object |
| |
| Specifying case sensitivity using case. |
| |
| >>> s1.str.contains('oG', case=True, regex=True) |
| 0 False |
| 1 False |
| 2 False |
| 3 False |
| 4 None |
| dtype: object |
| |
| Specifying na to be False instead of NaN replaces NaN values with |
| False. If Series does not contain NaN values the resultant dtype will |
| be bool, otherwise, an object dtype. |
| |
| >>> s1.str.contains('og', na=False, regex=True) |
| 0 False |
| 1 True |
| 2 False |
| 3 False |
| 4 False |
| dtype: bool |
| |
| Returning ‘house’ or ‘dog’ when either expression occurs in a string. |
| |
| >>> s1.str.contains('house|dog', regex=True) |
| 0 False |
| 1 True |
| 2 True |
| 3 False |
| 4 None |
| dtype: object |
| |
| Ignoring case sensitivity using flags with regex. |
| |
| >>> import re |
| >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) |
| 0 False |
| 1 False |
| 2 True |
| 3 False |
| 4 None |
| dtype: object |
| |
| Returning any digit using regular expression. |
| |
| >>> s1.str.contains('[0-9]', regex=True) |
| 0 False |
| 1 False |
| 2 False |
| 3 True |
| 4 None |
| dtype: object |
| |
| Ensure pat is a not a literal pattern when regex is set to True. |
| Note in the following example one might expect only s2[1] and s2[3] |
| to return True. However, ‘.0’ as a regex matches any character followed |
| by a 0. |
| |
| >>> s2 = ps.Series(['40','40.0','41','41.0','35']) |
| >>> s2.str.contains('.0', regex=True) |
| 0 True |
| 1 True |
| 2 False |
| 3 True |
| 4 False |
| dtype: bool |
| """ |
| |
| @no_type_check |
| def pandas_contains(s) -> "ps.Series[bool]": |
| return s.str.contains(pat, case, flags, na, regex) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_contains) |
| |
| def count(self, pat: str, flags: int = 0) -> "ps.Series": |
| """ |
| Count occurrences of pattern in each string of the Series. |
| |
| This function is used to count the number of times a particular regex |
| pattern is repeated in each of the string elements of the Series. |
| |
| Parameters |
| ---------- |
| pat : str |
| Valid regular expression. |
| flags : int, default 0 (no flags) |
| Flags for the re module. |
| |
| Returns |
| ------- |
| Series of int |
| A Series containing the integer counts of pattern matches. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.NaN, 'CABA', 'cat']) |
| >>> s.str.count('a') |
| 0 0.0 |
| 1 0.0 |
| 2 2.0 |
| 3 2.0 |
| 4 NaN |
| 5 0.0 |
| 6 1.0 |
| dtype: float64 |
| |
| Escape '$' to find the literal dollar sign. |
| |
| >>> s = ps.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) |
| >>> s.str.count('\\$') |
| 0 1 |
| 1 0 |
| 2 1 |
| 3 2 |
| 4 2 |
| 5 0 |
| dtype: int64 |
| """ |
| |
| @no_type_check |
| def pandas_count(s) -> "ps.Series[int]": |
| return s.str.count(pat, flags) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_count) |
| |
| @no_type_check |
| def decode(self, encoding, errors="strict") -> "ps.Series": |
| """ |
| Not supported. |
| """ |
| raise NotImplementedError() |
| |
| @no_type_check |
| def encode(self, encoding, errors="strict") -> "ps.Series": |
| """ |
| Not supported. |
| """ |
| raise NotImplementedError() |
| |
| @no_type_check |
| def extract(self, pat, flags=0, expand=True) -> "ps.Series": |
| """ |
| Not supported. |
| """ |
| raise NotImplementedError() |
| |
| @no_type_check |
| def extractall(self, pat, flags=0) -> "ps.Series": |
| """ |
| Not supported. |
| """ |
| raise NotImplementedError() |
| |
| def find(self, sub: str, start: int = 0, end: Optional[int] = None) -> "ps.Series": |
| """ |
| Return lowest indexes in each strings in the Series where the |
| substring is fully contained between [start:end]. |
| |
| Return -1 on failure. Equivalent to standard :func:`str.find`. |
| |
| Parameters |
| ---------- |
| sub : str |
| Substring being searched. |
| start : int |
| Left edge index. |
| end : int |
| Right edge index. |
| |
| Returns |
| ------- |
| Series of int |
| Series of lowest matching indexes. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['apple', 'oranges', 'bananas']) |
| |
| >>> s.str.find('a') |
| 0 0 |
| 1 2 |
| 2 1 |
| dtype: int64 |
| |
| >>> s.str.find('a', start=2) |
| 0 -1 |
| 1 2 |
| 2 3 |
| dtype: int64 |
| |
| >>> s.str.find('a', end=1) |
| 0 0 |
| 1 -1 |
| 2 -1 |
| dtype: int64 |
| |
| >>> s.str.find('a', start=2, end=2) |
| 0 -1 |
| 1 -1 |
| 2 -1 |
| dtype: int64 |
| """ |
| |
| @no_type_check |
| def pandas_find(s) -> "ps.Series[int]": |
| return s.str.find(sub, start, end) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_find) |
| |
def findall(self, pat: str, flags: int = 0) -> "ps.Series":
    """
    Collect every occurrence of a pattern or regular expression in each
    string of the Series.

    Equivalent to applying :func:`re.findall` to every element of the
    Series.

    Parameters
    ----------
    pat : str
        Pattern or regular expression.
    flags : int, default 0 (no flags)
        `re` module flags, e.g. `re.IGNORECASE`.

    Returns
    -------
    Series of object
        All non-overlapping matches of the pattern or regular expression
        in each string of this Series.

    Examples
    --------
    >>> s = ps.Series(['Lion', 'Monkey', 'Rabbit'])

    Searching for 'Monkey' yields a single match:

    >>> s.str.findall('Monkey')
    0          []
    1    [Monkey]
    2          []
    dtype: object

    Searching for 'MONKEY', by contrast, matches nothing:

    >>> s.str.findall('MONKEY')
    0    []
    1    []
    2    []
    dtype: object

    Flags can modify the search, for instance to find 'MONKEY' while
    ignoring case:

    >>> import re
    >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
    0          []
    1    [Monkey]
    2          []
    dtype: object

    When the pattern matches several strings of the Series, each match is
    reported:

    >>> s.str.findall('on')
    0    [on]
    1    [on]
    2      []
    dtype: object

    Regular expressions work as well. Here only strings ending in 'on'
    match:

    >>> s.str.findall('on$')
    0    [on]
    1      []
    2      []
    dtype: object

    A pattern that occurs several times within one string produces a list
    with several entries:

    >>> s.str.findall('b')
    0        []
    1        []
    2    [b, b]
    dtype: object
    """
    # An array return type cannot be expressed through a type hint yet, so
    # the return type is handed to pandas_udf explicitly.
    matches_type = ArrayType(StringType(), containsNull=True)

    @pandas_udf(returnType=matches_type)  # type: ignore
    def pudf(batch: pd.Series) -> pd.Series:
        return batch.str.findall(pat, flags=flags)

    matched_scol = pudf(self._data.spark.column)
    return self._data._with_new_scol(scol=matched_scol)
| |
| def index(self, sub: str, start: int = 0, end: Optional[int] = None) -> "ps.Series": |
| """ |
| Return lowest indexes in each strings where the substring is fully |
| contained between [start:end]. |
| |
| This is the same as :func:`str.find` except instead of returning -1, |
| it raises a ValueError when the substring is not found. Equivalent to |
| standard :func:`str.index`. |
| |
| Parameters |
| ---------- |
| sub : str |
| Substring being searched. |
| start : int |
| Left edge index. |
| end : int |
| Right edge index. |
| |
| Returns |
| ------- |
| Series of int |
| Series of lowest matching indexes. |
| |
| Examples |
| -------- |
| >>> s = ps.Series(['apple', 'oranges', 'bananas']) |
| |
| >>> s.str.index('a') |
| 0 0 |
| 1 2 |
| 2 1 |
| dtype: int64 |
| |
| The following expression throws an exception: |
| |
| >>> s.str.index('a', start=2) # doctest: +SKIP |
| """ |
| |
| @no_type_check |
| def pandas_index(s) -> "ps.Series[np.int64]": |
| return s.str.index(sub, start, end) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_index) |
| |
| def join(self, sep: str) -> "ps.Series": |
| """ |
| Join lists contained as elements in the Series with passed delimiter. |
| |
| If the elements of a Series are lists themselves, join the content of |
| these lists using the delimiter passed to the function. This function |
| is an equivalent to calling :func:`str.join` on the lists. |
| |
| Parameters |
| ---------- |
| sep : str |
| Delimiter to use between list entries. |
| |
| Returns |
| ------- |
| Series of object |
| Series with list entries concatenated by intervening occurrences of |
| the delimiter. |
| |
| See Also |
| -------- |
| str.split : Split strings around given separator/delimiter. |
| str.rsplit : Splits string around given separator/delimiter, |
| starting from the right. |
| |
| Examples |
| -------- |
| Example with a list that contains a None element. |
| |
| >>> s = ps.Series([['lion', 'elephant', 'zebra'], |
| ... ['cat', None, 'dog']]) |
| >>> s |
| 0 [lion, elephant, zebra] |
| 1 [cat, None, dog] |
| dtype: object |
| |
| Join all lists using a ‘-‘. The list containing None will produce None. |
| |
| >>> s.str.join('-') |
| 0 lion-elephant-zebra |
| 1 None |
| dtype: object |
| """ |
| |
| @no_type_check |
| def pandas_join(s) -> "ps.Series[str]": |
| return s.str.join(sep) |
| |
| return self._data.pandas_on_spark.transform_batch(pandas_join) |
| |
def len(self) -> "ps.Series":
    """
    Compute the length of every element of the Series.

    An element may be a sequence such as a string, tuple or list. Columns
    of Spark array or map type are measured with ``F.size``; every other
    column is measured with ``F.length``. The result is cast to long.

    Returns
    -------
    Series of int
        Length of each element of the Series.

    Examples
    --------
    For strings the length is the number of characters; for lists or
    tuples it is the number of entries.

    >>> s1 = ps.Series(['dog', 'monkey'])
    >>> s1.str.len()
    0    3
    1    6
    dtype: int64

    >>> s2 = ps.Series([["a", "b", "c"], []])
    >>> s2.str.len()
    0    3
    1    0
    dtype: int64
    """
    spark_accessor = self._data.spark
    if isinstance(spark_accessor.data_type, (ArrayType, MapType)):
        measure = F.size
    else:
        measure = F.length
    return spark_accessor.transform(lambda scol: measure(scol).cast(LongType()))
| |
def ljust(self, width: int, fillchar: str = " ") -> "ps.Series":
    """
    Pad the right side of each string in the Series with a fill character.

    Equivalent to :func:`str.ljust`.

    Parameters
    ----------
    width : int
        Minimum width of the resulting string; shorter strings are padded
        with `fillchar`.
    fillchar : str
        Character used for padding, default is whitespace.

    Returns
    -------
    Series of object

    Examples
    --------
    >>> s = ps.Series(["caribou", "tiger"])
    >>> s
    0    caribou
    1      tiger
    dtype: object

    >>> s.str.ljust(width=10, fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object
    """

    @no_type_check
    def _left_justify(batch) -> "ps.Series[str]":
        # Delegate to the ``str.ljust`` accessor of the object given by transform_batch.
        justified = batch.str.ljust(width, fillchar)
        return justified

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_left_justify)
| |
def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.nan) -> "ps.Series":
    """
    Determine if each string matches a regular expression.

    Analogous to :func:`contains`, but more strict, relying on
    :func:`re.match` instead of :func:`re.search`.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.

    Returns
    -------
    Series of boolean values or object
        A Series of boolean values indicating whether the given pattern can
        be matched in the string of each element of the Series.

    Examples
    --------
    >>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
    >>> s.str.match('dog')
    0    False
    1     True
    2    False
    3    False
    4     None
    dtype: object

    >>> s.str.match('mouse|dog', case=False)
    0     True
    1     True
    2    False
    3    False
    4     None
    dtype: object

    >>> s.str.match('.+and.+', na=True)
    0    False
    1    False
    2     True
    3    False
    4     True
    dtype: bool

    >>> import re
    >>> s.str.match('MOUSE', flags=re.IGNORECASE)
    0     True
    1    False
    2    False
    3    False
    4     None
    dtype: object
    """
    # NOTE: the default uses ``np.nan`` rather than the ``np.NaN`` alias, which was
    # removed in NumPy 2.0; on older NumPy both names refer to the same object.

    @no_type_check
    def pandas_match(s) -> "ps.Series[bool]":
        return s.str.match(pat, case, flags, na)

    return self._data.pandas_on_spark.transform_batch(pandas_match)
| |
def normalize(self, form: str) -> "ps.Series":
    """
    Return the Unicode normal form of the strings in the Series.

    For details on the available forms, see
    :func:`unicodedata.normalize`.

    Parameters
    ----------
    form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
        Unicode form.

    Returns
    -------
    Series of objects
        A Series of normalized strings.
    """

    @no_type_check
    def _normalize_batch(batch) -> "ps.Series[str]":
        # Delegate to the ``str.normalize`` accessor of the object given by transform_batch.
        normalized = batch.str.normalize(form)
        return normalized

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_normalize_batch)
| |
def pad(self, width: int, side: str = "left", fillchar: str = " ") -> "ps.Series":
    """
    Pad strings in the Series up to `width`.

    Parameters
    ----------
    width : int
        Minimum width of the resulting string; shorter strings are padded
        with the character given in `fillchar`.
    side : {'left', 'right', 'both'}, default 'left'
        Side from which to fill the resulting string.
    fillchar : str, default ' '
        Character used for padding, default is whitespace.

    Returns
    -------
    Series of object
        Series whose strings have at least `width` characters.

    Examples
    --------
    >>> s = ps.Series(["caribou", "tiger"])
    >>> s
    0    caribou
    1      tiger
    dtype: object

    >>> s.str.pad(width=10)
    0       caribou
    1         tiger
    dtype: object

    >>> s.str.pad(width=10, side='right', fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object

    >>> s.str.pad(width=10, side='both', fillchar='-')
    0    -caribou--
    1    --tiger---
    dtype: object
    """

    @no_type_check
    def _pad_batch(batch) -> "ps.Series[str]":
        # Delegate to the ``str.pad`` accessor of the object given by transform_batch.
        padded = batch.str.pad(width, side, fillchar)
        return padded

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_pad_batch)
| |
def partition(self, sep: str = " ", expand: bool = True) -> "ps.Series":
    """
    Not supported.

    Raises
    ------
    NotImplementedError
        Always; the arguments are not used.
    """
    raise NotImplementedError
| |
def repeat(self, repeats: int) -> "ps.Series":
    """
    Duplicate each string in the Series.

    Parameters
    ----------
    repeats : int
        Number of times to repeat each string. A sequence of int is not
        supported.

    Returns
    -------
    Series of object
        Series or Index of repeated string objects specified by input
        parameter repeats.

    Raises
    ------
    TypeError
        If `repeats` is not an int.

    Examples
    --------
    >>> s = ps.Series(['a', 'b', 'c'])
    >>> s
    0    a
    1    b
    2    c
    dtype: object

    Single int repeats string in Series

    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc
    dtype: object
    """
    if isinstance(repeats, int):
        return self._data.spark.transform(lambda c: SF.repeat(col=c, n=repeats))
    # Anything other than an int (e.g. a sequence of ints) is rejected.
    raise TypeError("repeats expects an int parameter")
| |
def replace(
    self,
    pat: str,
    repl: Union[str, Callable[[str], str]],
    n: int = -1,
    case: Optional[bool] = None,
    flags: int = 0,
    regex: bool = True,
) -> "ps.Series":
    """
    Replace occurrences of a pattern/regex in the Series with another
    string. Equivalent to :func:`str.replace` or :func:`re.sub`.

    Parameters
    ----------
    pat : str or compiled regex
        String can be a character sequence or regular expression.
    repl : str or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used. See
        :func:`re.sub`.
    n : int, default -1 (all)
        Number of replacements to make from start.
    case : boolean, default None
        If True, case sensitive (the default if pat is a string).
        Set to False for case insensitive.
        Cannot be set if pat is a compiled regex.
    flags: int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE.
        Cannot be set if pat is a compiled regex.
    regex : boolean, default True
        If True, assumes the passed-in pattern is a regular expression.
        If False, treats the pattern as a literal string.
        Cannot be set to False if pat is a compiled regex or repl is a
        callable.

    Returns
    -------
    Series of object
        A copy of the string with all matching occurrences of pat replaced
        by repl.

    Examples
    --------
    When pat is a string and regex is True (the default), the given pat is
    compiled as a regex. When repl is a string, it replaces matching regex
    patterns as with :func:`re.sub`. NaN value(s) in the Series are changed
    to None:

    >>> ps.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
    0     bao
    1     baz
    2    None
    dtype: object

    When pat is a string and regex is False, every pat is replaced with
    repl as with :func:`str.replace`:

    >>> ps.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
    0     bao
    1     fuz
    2    None
    dtype: object

    When repl is a callable, it is called on every pat using
    :func:`re.sub`. The callable should expect one positional argument (a
    regex object) and return a string.

    Reverse every lowercase alphabetic word:

    >>> repl = lambda m: m.group(0)[::-1]
    >>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
    0    oof 123
    1    rab zab
    2       None
    dtype: object

    Using regex groups (extract second group and swap case):

    >>> pat = r"(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
    >>> repl = lambda m: m.group('two').swapcase()
    >>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
    0    tWO
    1    bAR
    dtype: object

    Using a compiled regex with flags:

    >>> import re
    >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
    >>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
    0     foo
    1     bar
    2    None
    dtype: object
    """

    @no_type_check
    def _replace_batch(batch) -> "ps.Series[str]":
        # All options are forwarded by keyword to the ``str.replace`` accessor.
        replaced = batch.str.replace(pat, repl, n=n, case=case, flags=flags, regex=regex)
        return replaced

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_replace_batch)
| |
def rfind(self, sub: str, start: int = 0, end: Optional[int] = None) -> "ps.Series":
    """
    Return the highest index in each string of the Series where the
    substring is fully contained between [start:end].

    Return -1 on failure. Equivalent to standard :func:`str.rfind`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series of int
        Series of highest matching indexes.

    Examples
    --------
    >>> s = ps.Series(['apple', 'oranges', 'bananas'])

    >>> s.str.rfind('a')
    0    0
    1    2
    2    5
    dtype: int64

    >>> s.str.rfind('a', start=2)
    0   -1
    1    2
    2    5
    dtype: int64

    >>> s.str.rfind('a', end=1)
    0    0
    1   -1
    2   -1
    dtype: int64

    >>> s.str.rfind('a', start=2, end=2)
    0   -1
    1   -1
    2   -1
    dtype: int64
    """

    @no_type_check
    def _rfind_batch(batch) -> "ps.Series[int]":
        # Delegate to the ``str.rfind`` accessor of the object given by transform_batch.
        positions = batch.str.rfind(sub, start, end)
        return positions

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_rfind_batch)
| |
def rindex(self, sub: str, start: int = 0, end: Optional[int] = None) -> "ps.Series":
    """
    Return the highest index in each string where the substring is fully
    contained between [start:end].

    This is the same as :func:`str.rfind` except instead of returning -1,
    it raises a ValueError when the substring is not found. Equivalent to
    standard :func:`str.rindex`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series of int
        Series of highest matching indexes.

    Examples
    --------
    >>> s = ps.Series(['apple', 'oranges', 'bananas'])

    >>> s.str.rindex('a')
    0    0
    1    2
    2    5
    dtype: int64

    The following expression throws an exception:

    >>> s.str.rindex('a', start=2)  # doctest: +SKIP
    """

    @no_type_check
    def _rindex_batch(batch) -> "ps.Series[np.int64]":
        # Delegate to the ``str.rindex`` accessor of the object given by transform_batch.
        positions = batch.str.rindex(sub, start, end)
        return positions

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_rindex_batch)
| |
def rjust(self, width: int, fillchar: str = " ") -> "ps.Series":
    """
    Pad the left side of each string in the Series with a fill character.

    Equivalent to :func:`str.rjust`.

    Parameters
    ----------
    width : int
        Minimum width of the resulting string; shorter strings are padded
        with `fillchar`.
    fillchar : str
        Character used for padding, default is whitespace.

    Returns
    -------
    Series of object

    Examples
    --------
    >>> s = ps.Series(["caribou", "tiger"])
    >>> s
    0    caribou
    1      tiger
    dtype: object

    >>> s.str.rjust(width=10)
    0       caribou
    1         tiger
    dtype: object

    >>> s.str.rjust(width=10, fillchar='-')
    0    ---caribou
    1    -----tiger
    dtype: object
    """

    @no_type_check
    def _right_justify(batch) -> "ps.Series[str]":
        # Delegate to the ``str.rjust`` accessor of the object given by transform_batch.
        justified = batch.str.rjust(width, fillchar)
        return justified

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_right_justify)
| |
def rpartition(self, sep: str = " ", expand: bool = True) -> "ps.Series":
    """
    Not supported.

    Raises
    ------
    NotImplementedError
        Always; the arguments are not used.
    """
    raise NotImplementedError
| |
def slice(
    self, start: Optional[int] = None, stop: Optional[int] = None, step: Optional[int] = None
) -> "ps.Series":
    """
    Slice substrings from each element in the Series.

    Parameters
    ----------
    start : int, optional
        Start position for slice operation.
    stop : int, optional
        Stop position for slice operation.
    step : int, optional
        Step size for slice operation.

    Returns
    -------
    Series of object
        Series of substrings sliced from the original string objects.

    Examples
    --------
    >>> s = ps.Series(["koala", "fox", "chameleon"])
    >>> s
    0        koala
    1          fox
    2    chameleon
    dtype: object

    >>> s.str.slice(start=1)
    0        oala
    1          ox
    2    hameleon
    dtype: object

    >>> s.str.slice(stop=2)
    0    ko
    1    fo
    2    ch
    dtype: object

    >>> s.str.slice(step=2)
    0      kaa
    1       fx
    2    caeen
    dtype: object

    >>> s.str.slice(start=0, stop=5, step=3)
    0    kl
    1     f
    2    cm
    dtype: object
    """

    @no_type_check
    def _slice_batch(batch) -> "ps.Series[str]":
        # Delegate to the ``str.slice`` accessor of the object given by transform_batch.
        pieces = batch.str.slice(start, stop, step)
        return pieces

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_slice_batch)
| |
def slice_replace(
    self, start: Optional[int] = None, stop: Optional[int] = None, repl: Optional[str] = None
) -> "ps.Series":
    """
    Replace a positional slice of each string in the Series with another
    value.

    Parameters
    ----------
    start : int, optional
        Start position for slice operation. If not specified (None), the
        slice is unbounded on the left, i.e. slice from the start of the
        string.
    stop : int, optional
        Stop position for slice operation. If not specified (None), the
        slice is unbounded on the right, i.e. slice until the end of the
        string.
    repl : str, optional
        String for replacement. If not specified (None), the sliced region
        is replaced with an empty string.

    Returns
    -------
    Series of object
        Series in which the sliced region of each string is replaced by
        `repl`.

    Examples
    --------
    >>> s = ps.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
    >>> s
    0        a
    1       ab
    2      abc
    3     abdc
    4    abcde
    dtype: object

    Specify just start, meaning replace start until the end of the string
    with repl.

    >>> s.str.slice_replace(1, repl='X')
    0    aX
    1    aX
    2    aX
    3    aX
    4    aX
    dtype: object

    Specify just stop, meaning the start of the string to stop is replaced
    with repl, and the rest of the string is included.

    >>> s.str.slice_replace(stop=2, repl='X')
    0       X
    1       X
    2      Xc
    3     Xdc
    4    Xcde
    dtype: object

    Specify start and stop, meaning the slice from start to stop is
    replaced with repl. Everything before or after start and stop is
    included as is.

    >>> s.str.slice_replace(start=1, stop=3, repl='X')
    0      aX
    1      aX
    2      aX
    3     aXc
    4    aXde
    dtype: object
    """

    @no_type_check
    def _slice_replace_batch(batch) -> "ps.Series[str]":
        # Delegate to the ``str.slice_replace`` accessor of the object given by transform_batch.
        patched = batch.str.slice_replace(start, stop, repl)
        return patched

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_slice_replace_batch)
| |
def split(
    self, pat: Optional[str] = None, n: int = -1, expand: bool = False
) -> Union["ps.Series", "ps.DataFrame"]:
    """
    Split strings around given separator/delimiter.

    Splits the string in the Series from the beginning, at the specified
    delimiter string. Equivalent to :func:`str.split`.

    Parameters
    ----------
    pat : str, optional
        String or regular expression to split on. If not specified, split
        on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output. None, 0 and -1 will be
        interpreted as return all splits.
    expand : bool, default False
        Expand the split strings into separate columns.

        * If ``True``, `n` must be a positive integer, and return DataFrame expanding
          dimensionality.
        * If ``False``, return Series, containing lists of strings.

    Returns
    -------
    Series, DataFrame
        Type matches caller unless `expand=True` (see Notes).

    Raises
    ------
    NotImplementedError
        If ``expand=True`` and `n` is None or not a positive integer.

    See Also
    --------
    str.rsplit : Splits string around given separator/delimiter,
        starting from the right.
    str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`,  make first `n` splits only
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series callers return DataFrame objects with `n + 1` columns.

    .. note:: Even if `n` is much larger than found splits, the number of columns does NOT
        shrink unlike pandas.

    Examples
    --------
    >>> s = ps.Series(["this is a regular sentence",
    ...                "https://docs.python.org/3/tutorial/index.html",
    ...                np.nan])

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                               None
    dtype: object

    Without the n parameter, the outputs of rsplit and split are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                               None
    dtype: object

    The n parameter can be used to limit the number of splits on the
    delimiter. The outputs of split and rsplit are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                               None
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                               None
    dtype: object

    The pat parameter can be used to split by other characters.

    >>> s.str.split(pat = "/")
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                 None
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(n=4, expand=True)
                                                   0     1     2        3         4
    0                                           this    is     a  regular  sentence
    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
    2                                           None  None  None     None      None

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                None        None

    Remember to escape special characters when explicitly using regular
    expressions.

    >>> s = ps.Series(["1+1=2"])
    >>> s.str.split(r"\\+|=", n=2, expand=True)
       0  1  2
    0  1  1  2
    """
    from pyspark.pandas.frame import DataFrame

    # ``n`` is documented to accept None ("all splits"); check it explicitly so that
    # ``expand=True`` reports the unsupported case instead of failing on ``None <= 0``.
    if expand and (n is None or n <= 0):
        raise NotImplementedError("expand=True is currently only supported with n > 0.")

    # type hint does not support to specify array type yet.
    return_type = ArrayType(StringType(), containsNull=True)

    @pandas_udf(returnType=return_type)  # type: ignore
    def pudf(s: pd.Series) -> pd.Series:
        # ``n`` is passed by keyword: pandas >= 2.0 accepts only ``pat`` positionally.
        return s.str.split(pat, n=n)

    psser = self._data._with_new_scol(
        pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
        field=self._data._internal.data_fields[0].copy(spark_type=return_type, nullable=True),
    )

    if not expand:
        return psser

    # Expand the array column into ``n + 1`` columns labelled 0..n.
    num_columns = n + 1
    psdf = psser.to_frame()
    scol = psdf._internal.data_spark_columns[0]
    spark_columns = [scol[i].alias(str(i)) for i in range(num_columns)]
    column_labels = [(i,) for i in range(num_columns)]
    internal = psdf._internal.with_new_columns(
        spark_columns,
        column_labels=cast(Optional[List], column_labels),
        data_fields=[
            self._data._internal.data_fields[0].copy(name=str(i), nullable=True)
            for i in range(num_columns)
        ],
    )
    return DataFrame(internal)
| |
def rsplit(
    self, pat: Optional[str] = None, n: int = -1, expand: bool = False
) -> Union["ps.Series", "ps.DataFrame"]:
    """
    Split strings around given separator/delimiter.

    Splits the string in the Series from the end, at the specified
    delimiter string. Equivalent to :func:`str.rsplit`.

    Parameters
    ----------
    pat : str, optional
        String or regular expression to split on. If not specified, split
        on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output. None, 0 and -1 will be
        interpreted as return all splits.
    expand : bool, default False
        Expand the split strings into separate columns.

        * If ``True``, `n` must be a positive integer, and return DataFrame expanding
          dimensionality.
        * If ``False``, return Series, containing lists of strings.

    Returns
    -------
    Series, DataFrame
        Type matches caller unless `expand=True` (see Notes).

    Raises
    ------
    NotImplementedError
        If ``expand=True`` and `n` is None or not a positive integer.

    See Also
    --------
    str.split : Split strings around given separator/delimiter.
    str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`,  make only `n` splits, counted from the end
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series callers return DataFrame objects with `n + 1` columns.

    .. note:: Even if `n` is much larger than found splits, the number of columns does NOT
        shrink unlike pandas.

    Examples
    --------
    >>> s = ps.Series(["this is a regular sentence",
    ...                "https://docs.python.org/3/tutorial/index.html",
    ...                np.nan])

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                               None
    dtype: object

    Without the n parameter, the outputs of rsplit and split are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                               None
    dtype: object

    The n parameter can be used to limit the number of splits on the
    delimiter. The outputs of split and rsplit are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                               None
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                               None
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(n=4, expand=True)
                                                   0     1     2        3         4
    0                                           this    is     a  regular  sentence
    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
    2                                           None  None  None     None      None

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                None        None

    Remember to escape special characters when explicitly using regular
    expressions.

    >>> s = ps.Series(["1+1=2"])
    >>> s.str.split(r"\\+|=", n=2, expand=True)
       0  1  2
    0  1  1  2
    """
    from pyspark.pandas.frame import DataFrame

    # ``n`` is documented to accept None ("all splits"); check it explicitly so that
    # ``expand=True`` reports the unsupported case instead of failing on ``None <= 0``.
    if expand and (n is None or n <= 0):
        raise NotImplementedError("expand=True is currently only supported with n > 0.")

    # type hint does not support to specify array type yet.
    return_type = ArrayType(StringType(), containsNull=True)

    @pandas_udf(returnType=return_type)  # type: ignore
    def pudf(s: pd.Series) -> pd.Series:
        # ``n`` is passed by keyword: pandas >= 2.0 accepts only ``pat`` positionally.
        return s.str.rsplit(pat, n=n)

    psser = self._data._with_new_scol(
        pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
        field=self._data._internal.data_fields[0].copy(spark_type=return_type, nullable=True),
    )

    if not expand:
        return psser

    # Expand the array column into ``n + 1`` columns labelled 0..n.
    num_columns = n + 1
    psdf = psser.to_frame()
    scol = psdf._internal.data_spark_columns[0]
    spark_columns = [scol[i].alias(str(i)) for i in range(num_columns)]
    column_labels = [(i,) for i in range(num_columns)]
    internal = psdf._internal.with_new_columns(
        spark_columns,
        column_labels=cast(Optional[List], column_labels),
        data_fields=[
            self._data._internal.data_fields[0].copy(name=str(i), nullable=True)
            for i in range(num_columns)
        ],
    )
    return DataFrame(internal)
| |
def translate(self, table: Dict) -> "ps.Series":
    """
    Map all characters in each string through the given mapping table.

    Equivalent to standard :func:`str.translate`.

    Parameters
    ----------
    table : dict
        Table is a mapping of Unicode ordinals to Unicode ordinals,
        strings, or None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :func:`str.maketrans` is a
        helper function for making translation tables.

    Returns
    -------
    Series of object
        Series with translated strings.

    Examples
    --------
    >>> s = ps.Series(["dog", "cat", "bird"])
    >>> m = str.maketrans({'a': 'X', 'i': 'Y', 'o': None})
    >>> s.str.translate(m)
    0      dg
    1     cXt
    2    bYrd
    dtype: object
    """

    @no_type_check
    def _translate_batch(batch) -> "ps.Series[str]":
        # Delegate to the ``str.translate`` accessor of the object given by transform_batch.
        translated = batch.str.translate(table)
        return translated

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_translate_batch)
| |
def wrap(self, width: int, **kwargs: bool) -> "ps.Series":
    """
    Wrap long strings in the Series to be formatted in paragraphs with
    length less than a given width.

    This method has the same keyword parameters and defaults as
    :class:`textwrap.TextWrapper`.

    Parameters
    ----------
    width : int
        Maximum line-width. Lines separated with newline char.
    expand_tabs : bool, optional
        If true, tab characters will be expanded to spaces (default: True).
    replace_whitespace : bool, optional
        If true, each whitespace character remaining after tab expansion
        will be replaced by a single space (default: True).
    drop_whitespace : bool, optional
        If true, whitespace that, after wrapping, happens to end up at the
        beginning or end of a line is dropped (default: True).
    break_long_words : bool, optional
        If true, then words longer than width will be broken in order to
        ensure that no lines are longer than width. If it is false, long
        words will not be broken, and some lines may be longer than width
        (default: True).
    break_on_hyphens : bool, optional
        If true, wrapping will occur preferably on whitespace and right
        after hyphens in compound words, as it is customary in English.
        If false, only whitespaces will be considered as potentially good
        places for line breaks, but you need to set break_long_words to
        false if you want truly insecable words (default: True).

    Returns
    -------
    Series of object
        Series with wrapped strings.

    Examples
    --------
    >>> s = ps.Series(['line to be wrapped', 'another line to be wrapped'])
    >>> s.str.wrap(12)
    0             line to be\\nwrapped
    1    another line\\nto be\\nwrapped
    dtype: object
    """

    @no_type_check
    def _wrap_batch(batch) -> "ps.Series[str]":
        # Keyword options are forwarded unchanged to the ``str.wrap`` accessor.
        wrapped = batch.str.wrap(width, **kwargs)
        return wrapped

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_wrap_batch)
| |
def zfill(self, width: int) -> "ps.Series":
    """
    Pad strings in the Series by prepending '0' characters.

    Strings in the Series are padded with '0' characters on the left of the
    string to reach a total string length width. Strings in the Series with
    length greater or equal to width are unchanged.

    Differs from :func:`str.zfill` which has special handling for '+'/'-'
    in the string.

    Parameters
    ----------
    width : int
        Minimum length of resulting string; strings with length less than
        width will be prepended with '0' characters.

    Returns
    -------
    Series of object
        Series with '0' left-padded strings.

    Examples
    --------
    >>> s = ps.Series(['-1', '1', '1000', np.nan])
    >>> s
    0      -1
    1       1
    2    1000
    3    None
    dtype: object

    Note that NaN is not a string, so it stays a missing value (shown as
    None). The minus sign in '-1' is treated as a regular character and the
    zero is added to the left of it (:func:`str.zfill` would have moved it
    to the left). 1000 remains unchanged as it is longer than width.

    >>> s.str.zfill(3)
    0     0-1
    1     001
    2    1000
    3    None
    dtype: object
    """

    @no_type_check
    def _zfill_batch(batch) -> "ps.Series[str]":
        # Delegate to the ``str.zfill`` accessor of the object given by transform_batch.
        filled = batch.str.zfill(width)
        return filled

    batch_api = self._data.pandas_on_spark
    return batch_api.transform_batch(_zfill_batch)
| |
@no_type_check
def get_dummies(self, sep: str = "|") -> "ps.DataFrame":
    """
    Not supported.

    Raises
    ------
    NotImplementedError
        Always; the argument is not used.
    """
    raise NotImplementedError
| |
| |
def _test() -> None:
    """
    Run this module's doctests inside a local Spark session.

    Changes the working directory to ``$SPARK_HOME``, runs the doctests of
    ``pyspark.pandas.strings`` with ``ps`` bound to ``pyspark.pandas``, stops
    the session, and exits with status -1 if any doctest failed.
    """
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.strings

    os.chdir(os.environ["SPARK_HOME"])

    module = pyspark.pandas.strings
    namespace = module.__dict__.copy()
    namespace["ps"] = pyspark.pandas

    builder = SparkSession.builder.master("local[4]")
    session = builder.appName("pyspark.pandas.strings tests").getOrCreate()

    outcome = doctest.testmod(
        module,
        globs=namespace,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    session.stop()

    # ``outcome`` is doctest's (failed, attempted) result; only failures matter here.
    if outcome.failed:
        sys.exit(-1)
| |
| |
# Run the module's doctests (via ``_test``) when this file is executed as a script.
if __name__ == "__main__":
    _test()