#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
String functions on pandas-on-Spark Series
"""
from typing import (
Any,
Callable,
Dict,
List,
Optional,
Union,
TYPE_CHECKING,
cast,
no_type_check,
)
import numpy as np
import pandas as pd
from pyspark.sql.types import StringType, BinaryType, ArrayType, LongType, MapType
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf
from pyspark.pandas.spark import functions as SF
if TYPE_CHECKING:
import pyspark.pandas as ps # noqa: F401 (SPARK-34943)
class StringMethods(object):
"""String methods for pandas-on-Spark Series"""
def __init__(self, series: "ps.Series"):
if not isinstance(series.spark.data_type, (StringType, BinaryType, ArrayType)):
raise ValueError("Cannot call StringMethods on type {}".format(series.spark.data_type))
self._data = series
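# A minimal usage sketch (assumes an active Spark session; the sample
# values are illustrative). These methods are reached through the
# Series.str accessor:
#
#   import pyspark.pandas as ps
#   s = ps.Series(["a", "Bb"])
#   s.str.upper()  # dispatches to StringMethods.upper below
#   s.str.len()    # also defined for BinaryType and ArrayType data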
# Methods
def capitalize(self) -> "ps.Series":
"""
Convert strings in the Series to be capitalized.
Examples
--------
>>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
>>> s
0 lower
1 CAPITALS
2 this is a sentence
3 SwApCaSe
dtype: object
>>> s.str.capitalize()
0 Lower
1 Capitals
2 This is a sentence
3 Swapcase
dtype: object
"""
@no_type_check
def pandas_capitalize(s) -> "ps.Series[str]":
return s.str.capitalize()
return self._data.pandas_on_spark.transform_batch(pandas_capitalize)
def title(self) -> "ps.Series":
"""
Convert strings in the Series to titlecase.
Examples
--------
>>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
>>> s
0 lower
1 CAPITALS
2 this is a sentence
3 SwApCaSe
dtype: object
>>> s.str.title()
0 Lower
1 Capitals
2 This Is A Sentence
3 Swapcase
dtype: object
"""
@no_type_check
def pandas_title(s) -> "ps.Series[str]":
return s.str.title()
return self._data.pandas_on_spark.transform_batch(pandas_title)
def lower(self) -> "ps.Series":
"""
Convert strings in the Series/Index to all lowercase.
Examples
--------
>>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
>>> s
0 lower
1 CAPITALS
2 this is a sentence
3 SwApCaSe
dtype: object
>>> s.str.lower()
0 lower
1 capitals
2 this is a sentence
3 swapcase
dtype: object
"""
return self._data.spark.transform(F.lower)
def upper(self) -> "ps.Series":
"""
Convert strings in the Series/Index to all uppercase.
Examples
--------
>>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
>>> s
0 lower
1 CAPITALS
2 this is a sentence
3 SwApCaSe
dtype: object
>>> s.str.upper()
0 LOWER
1 CAPITALS
2 THIS IS A SENTENCE
3 SWAPCASE
dtype: object
"""
return self._data.spark.transform(F.upper)
def swapcase(self) -> "ps.Series":
"""
Convert strings in the Series/Index to be swapcased.
Examples
--------
>>> s = ps.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
>>> s
0 lower
1 CAPITALS
2 this is a sentence
3 SwApCaSe
dtype: object
>>> s.str.swapcase()
0 LOWER
1 capitals
2 THIS IS A SENTENCE
3 sWaPcAsE
dtype: object
"""
@no_type_check
def pandas_swapcase(s) -> "ps.Series[str]":
return s.str.swapcase()
return self._data.pandas_on_spark.transform_batch(pandas_swapcase)
def startswith(self, pattern: str, na: Optional[Any] = None) -> "ps.Series":
"""
Test if the start of each string element matches a pattern.
Equivalent to :func:`str.startswith`.
Parameters
----------
pattern : str
Character sequence. Regular expressions are not accepted.
na : object, default None
Object shown if element is not a string. NaN converted to None.
Returns
-------
Series of bool or object
pandas-on-Spark Series of booleans indicating whether the given pattern
matches the start of each string element.
Examples
--------
>>> s = ps.Series(['bat', 'Bear', 'cat', np.nan])
>>> s
0 bat
1 Bear
2 cat
3 None
dtype: object
>>> s.str.startswith('b')
0 True
1 False
2 False
3 None
dtype: object
Specifying na to be False instead of None.
>>> s.str.startswith('b', na=False)
0 True
1 False
2 False
3 False
dtype: bool
"""
@no_type_check
def pandas_startswith(s) -> "ps.Series[bool]":
return s.str.startswith(pattern, na)
return self._data.pandas_on_spark.transform_batch(pandas_startswith)
def endswith(self, pattern: str, na: Optional[Any] = None) -> "ps.Series":
"""
Test if the end of each string element matches a pattern.
Equivalent to :func:`str.endswith`.
Parameters
----------
pattern : str
Character sequence. Regular expressions are not accepted.
na : object, default None
Object shown if element is not a string. NaN converted to None.
Returns
-------
Series of bool or object
pandas-on-Spark Series of booleans indicating whether the given pattern
matches the end of each string element.
Examples
--------
>>> s = ps.Series(['bat', 'Bear', 'cat', np.nan])
>>> s
0 bat
1 Bear
2 cat
3 None
dtype: object
>>> s.str.endswith('t')
0 True
1 False
2 True
3 None
dtype: object
Specifying na to be False instead of None.
>>> s.str.endswith('t', na=False)
0 True
1 False
2 True
3 False
dtype: bool
"""
@no_type_check
def pandas_endswith(s) -> "ps.Series[bool]":
return s.str.endswith(pattern, na)
return self._data.pandas_on_spark.transform_batch(pandas_endswith)
def strip(self, to_strip: Optional[str] = None) -> "ps.Series":
"""
Remove leading and trailing characters.
Strip whitespaces (including newlines) or a set of specified
characters from each string in the Series/Index from left and
right sides. Equivalent to :func:`str.strip`.
Parameters
----------
to_strip : str
Specifying the set of characters to be removed. All combinations
of this set of characters will be stripped. If None then
whitespaces are removed.
Returns
-------
Series of objects
Examples
--------
>>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None])
>>> s
0 1. Ant.
1 2. Bee!\\t
2 None
dtype: object
>>> s.str.strip()
0 1. Ant.
1 2. Bee!
2 None
dtype: object
>>> s.str.strip('12.')
0 Ant
1 Bee!\\t
2 None
dtype: object
>>> s.str.strip('.!\\t')
0 1. Ant
1 2. Bee
2 None
dtype: object
"""
@no_type_check
def pandas_strip(s) -> "ps.Series[str]":
return s.str.strip(to_strip)
return self._data.pandas_on_spark.transform_batch(pandas_strip)
def lstrip(self, to_strip: Optional[str] = None) -> "ps.Series":
"""
Remove leading characters.
Strip whitespaces (including newlines) or a set of specified
characters from each string in the Series/Index from left side.
Equivalent to :func:`str.lstrip`.
Parameters
----------
to_strip : str
Specifying the set of characters to be removed. All combinations
of this set of characters will be stripped. If None then
whitespaces are removed.
Returns
-------
Series of object
Examples
--------
>>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None])
>>> s
0 1. Ant.
1 2. Bee!\\t
2 None
dtype: object
>>> s.str.lstrip('12.')
0 Ant.
1 Bee!\\t
2 None
dtype: object
"""
@no_type_check
def pandas_lstrip(s) -> "ps.Series[str]":
return s.str.lstrip(to_strip)
return self._data.pandas_on_spark.transform_batch(pandas_lstrip)
def rstrip(self, to_strip: Optional[str] = None) -> "ps.Series":
"""
Remove trailing characters.
Strip whitespaces (including newlines) or a set of specified
characters from each string in the Series/Index from right side.
Equivalent to :func:`str.rstrip`.
Parameters
----------
to_strip : str
Specifying the set of characters to be removed. All combinations
of this set of characters will be stripped. If None then
whitespaces are removed.
Returns
-------
Series of object
Examples
--------
>>> s = ps.Series(['1. Ant.', '2. Bee!\\t', None])
>>> s
0 1. Ant.
1 2. Bee!\\t
2 None
dtype: object
>>> s.str.rstrip('.!\\t')
0 1. Ant
1 2. Bee
2 None
dtype: object
"""
@no_type_check
def pandas_rstrip(s) -> "ps.Series[str]":
return s.str.rstrip(to_strip)
return self._data.pandas_on_spark.transform_batch(pandas_rstrip)
def get(self, i: int) -> "ps.Series":
"""
Extract element from each string or string list/tuple in the Series
at the specified position.
Parameters
----------
i : int
Position of element to extract.
Returns
-------
Series of objects
Examples
--------
>>> s1 = ps.Series(["String", "123"])
>>> s1
0 String
1 123
dtype: object
>>> s1.str.get(1)
0 t
1 2
dtype: object
>>> s1.str.get(-1)
0 g
1 3
dtype: object
>>> s2 = ps.Series([["a", "b", "c"], ["x", "y"]])
>>> s2
0 [a, b, c]
1 [x, y]
dtype: object
>>> s2.str.get(0)
0 a
1 x
dtype: object
>>> s2.str.get(2)
0 c
1 None
dtype: object
"""
@no_type_check
def pandas_get(s) -> "ps.Series[str]":
return s.str.get(i)
return self._data.pandas_on_spark.transform_batch(pandas_get)
def isalnum(self) -> "ps.Series":
"""
Check whether all characters in each string are alphanumeric.
This is equivalent to running the Python string method
:func:`str.isalnum` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s1 = ps.Series(['one', 'one1', '1', ''])
>>> s1.str.isalnum()
0 True
1 True
2 True
3 False
dtype: bool
Note that checks against characters mixed with any additional
punctuation or whitespace will evaluate to false for an alphanumeric
check.
>>> s2 = ps.Series(['A B', '1.5', '3,000'])
>>> s2.str.isalnum()
0 False
1 False
2 False
dtype: bool
"""
@no_type_check
def pandas_isalnum(s) -> "ps.Series[bool]":
return s.str.isalnum()
return self._data.pandas_on_spark.transform_batch(pandas_isalnum)
def isalpha(self) -> "ps.Series":
"""
Check whether all characters in each string are alphabetic.
This is equivalent to running the Python string method
:func:`str.isalpha` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s1 = ps.Series(['one', 'one1', '1', ''])
>>> s1.str.isalpha()
0 True
1 False
2 False
3 False
dtype: bool
"""
@no_type_check
def pandas_isalpha(s) -> "ps.Series[bool]":
return s.str.isalpha()
return self._data.pandas_on_spark.transform_batch(pandas_isalpha)
def isdigit(self) -> "ps.Series":
"""
Check whether all characters in each string are digits.
This is equivalent to running the Python string method
:func:`str.isdigit` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s = ps.Series(['23', '³', '⅕', ''])
The s.str.isdecimal method checks for characters used to form numbers
in base 10.
>>> s.str.isdecimal()
0 True
1 False
2 False
3 False
dtype: bool
The s.str.isdigit method is the same as s.str.isdecimal but also
includes special digits, like superscripted and subscripted digits in
unicode.
>>> s.str.isdigit()
0 True
1 True
2 False
3 False
dtype: bool
The s.str.isnumeric method is the same as s.str.isdigit but also
includes other characters that can represent quantities such as unicode
fractions.
>>> s.str.isnumeric()
0 True
1 True
2 True
3 False
dtype: bool
"""
@no_type_check
def pandas_isdigit(s) -> "ps.Series[bool]":
return s.str.isdigit()
return self._data.pandas_on_spark.transform_batch(pandas_isdigit)
def isspace(self) -> "ps.Series":
"""
Check whether all characters in each string are whitespaces.
This is equivalent to running the Python string method
:func:`str.isspace` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s = ps.Series([' ', '\\t\\r\\n ', ''])
>>> s.str.isspace()
0 True
1 True
2 False
dtype: bool
"""
@no_type_check
def pandas_isspace(s) -> "ps.Series[bool]":
return s.str.isspace()
return self._data.pandas_on_spark.transform_batch(pandas_isspace)
def islower(self) -> "ps.Series":
"""
Check whether all characters in each string are lowercase.
This is equivalent to running the Python string method
:func:`str.islower` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
>>> s.str.islower()
0 True
1 False
2 False
3 False
dtype: bool
"""
@no_type_check
def pandas_islower(s) -> "ps.Series[bool]":
return s.str.islower()
return self._data.pandas_on_spark.transform_batch(pandas_islower)
def isupper(self) -> "ps.Series":
"""
Check whether all characters in each string are uppercase.
This is equivalent to running the Python string method
:func:`str.isupper` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
>>> s.str.isupper()
0 False
1 False
2 True
3 False
dtype: bool
"""
@no_type_check
def pandas_isupper(s) -> "ps.Series[bool]":
return s.str.isupper()
return self._data.pandas_on_spark.transform_batch(pandas_isupper)
def istitle(self) -> "ps.Series":
"""
Check whether all characters in each string are titlecase.
This is equivalent to running the Python string method
:func:`str.istitle` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s = ps.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
The s.str.istitle method checks whether all words are in title
case (only the first letter of each word is capitalized).
Words are assumed to be any sequence of non-numeric characters
separated by whitespace characters.
>>> s.str.istitle()
0 False
1 True
2 False
3 False
dtype: bool
"""
@no_type_check
def pandas_istitle(s) -> "ps.Series[bool]":
return s.str.istitle()
return self._data.pandas_on_spark.transform_batch(pandas_istitle)
def isnumeric(self) -> "ps.Series":
"""
Check whether all characters in each string are numeric.
This is equivalent to running the Python string method
:func:`str.isnumeric` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s1 = ps.Series(['one', 'one1', '1', ''])
>>> s1.str.isnumeric()
0 False
1 False
2 True
3 False
dtype: bool
>>> s2 = ps.Series(['23', '³', '⅕', ''])
The s2.str.isdecimal method checks for characters used to form numbers
in base 10.
>>> s2.str.isdecimal()
0 True
1 False
2 False
3 False
dtype: bool
The s2.str.isdigit method is the same as s2.str.isdecimal but also
includes special digits, like superscripted and subscripted digits in
unicode.
>>> s2.str.isdigit()
0 True
1 True
2 False
3 False
dtype: bool
The s2.str.isnumeric method is the same as s2.str.isdigit but also
includes other characters that can represent quantities such as unicode
fractions.
>>> s2.str.isnumeric()
0 True
1 True
2 True
3 False
dtype: bool
"""
@no_type_check
def pandas_isnumeric(s) -> "ps.Series[bool]":
return s.str.isnumeric()
return self._data.pandas_on_spark.transform_batch(pandas_isnumeric)
def isdecimal(self) -> "ps.Series":
"""
Check whether all characters in each string are decimals.
This is equivalent to running the Python string method
:func:`str.isdecimal` for each element of the Series/Index.
If a string has zero characters, False is returned for that check.
Examples
--------
>>> s = ps.Series(['23', '³', '⅕', ''])
The s.str.isdecimal method checks for characters used to form numbers
in base 10.
>>> s.str.isdecimal()
0 True
1 False
2 False
3 False
dtype: bool
The s.str.isdigit method is the same as s.str.isdecimal but also
includes special digits, like superscripted and subscripted digits in
unicode.
>>> s.str.isdigit()
0 True
1 True
2 False
3 False
dtype: bool
The s.str.isnumeric method is the same as s.str.isdigit but also
includes other characters that can represent quantities such as unicode
fractions.
>>> s.str.isnumeric()
0 True
1 True
2 True
3 False
dtype: bool
"""
@no_type_check
def pandas_isdecimal(s) -> "ps.Series[bool]":
return s.str.isdecimal()
return self._data.pandas_on_spark.transform_batch(pandas_isdecimal)
@no_type_check
def cat(self, others=None, sep=None, na_rep=None, join=None) -> "ps.Series":
"""
Not supported.
"""
raise NotImplementedError()
def center(self, width: int, fillchar: str = " ") -> "ps.Series":
"""
Filling left and right side of strings in the Series/Index with an
additional character. Equivalent to :func:`str.center`.
Parameters
----------
width : int
Minimum width of resulting string; additional characters will be
filled with fillchar.
fillchar : str
Additional character for filling, default is whitespace.
Returns
-------
Series of objects
Examples
--------
>>> s = ps.Series(["caribou", "tiger"])
>>> s
0 caribou
1 tiger
dtype: object
>>> s.str.center(width=10, fillchar='-')
0 -caribou--
1 --tiger---
dtype: object
"""
@no_type_check
def pandas_center(s) -> "ps.Series[str]":
return s.str.center(width, fillchar)
return self._data.pandas_on_spark.transform_batch(pandas_center)
def contains(
self, pat: str, case: bool = True, flags: int = 0, na: Any = None, regex: bool = True
) -> "ps.Series":
"""
Test if pattern or regex is contained within a string of a Series.
Return boolean Series based on whether a given pattern or regex is
contained within a string of a Series.
Analogous to :func:`match`, but less strict, relying on
:func:`re.search` instead of :func:`re.match`.
Parameters
----------
pat : str
Character sequence or regular expression.
case : bool, default True
If True, case sensitive.
flags : int, default 0 (no flags)
Flags to pass through to the re module, e.g. re.IGNORECASE.
na : default None
Fill value for missing values. NaN converted to None.
regex : bool, default True
If True, assumes the pat is a regular expression.
If False, treats the pat as a literal string.
Returns
-------
Series of boolean values or object
A Series of boolean values indicating whether the given pattern is
contained within the string of each element of the Series.
Examples
--------
Returning a Series of booleans using only a literal pattern.
>>> s1 = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
>>> s1.str.contains('og', regex=False)
0 False
1 True
2 False
3 False
4 None
dtype: object
Specifying case sensitivity using case.
>>> s1.str.contains('oG', case=True, regex=True)
0 False
1 False
2 False
3 False
4 None
dtype: object
Specifying na to be False instead of NaN replaces NaN values with
False. If Series does not contain NaN values the resultant dtype will
be bool, otherwise, an object dtype.
>>> s1.str.contains('og', na=False, regex=True)
0 False
1 True
2 False
3 False
4 False
dtype: bool
Returning ‘house’ or ‘dog’ when either expression occurs in a string.
>>> s1.str.contains('house|dog', regex=True)
0 False
1 True
2 True
3 False
4 None
dtype: object
Ignoring case sensitivity using flags with regex.
>>> import re
>>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
0 False
1 False
2 True
3 False
4 None
dtype: object
Returning any digit using regular expression.
>>> s1.str.contains('[0-9]', regex=True)
0 False
1 False
2 False
3 True
4 None
dtype: object
Ensure pat is not a literal pattern when regex is set to True.
Note in the following example one might expect only s2[1] and s2[3]
to return True. However, ‘.0’ as a regex matches any character followed
by a 0.
>>> s2 = ps.Series(['40','40.0','41','41.0','35'])
>>> s2.str.contains('.0', regex=True)
0 True
1 True
2 False
3 True
4 False
dtype: bool
"""
@no_type_check
def pandas_contains(s) -> "ps.Series[bool]":
return s.str.contains(pat, case, flags, na, regex)
return self._data.pandas_on_spark.transform_batch(pandas_contains)
def count(self, pat: str, flags: int = 0) -> "ps.Series":
"""
Count occurrences of pattern in each string of the Series.
This function is used to count the number of times a particular regex
pattern is repeated in each of the string elements of the Series.
Parameters
----------
pat : str
Valid regular expression.
flags : int, default 0 (no flags)
Flags for the re module.
Returns
-------
Series of int
A Series containing the integer counts of pattern matches.
Examples
--------
>>> s = ps.Series(['A', 'B', 'Aaba', 'Baca', np.NaN, 'CABA', 'cat'])
>>> s.str.count('a')
0 0.0
1 0.0
2 2.0
3 2.0
4 NaN
5 0.0
6 1.0
dtype: float64
Escape '$' to find the literal dollar sign.
>>> s = ps.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
>>> s.str.count('\\$')
0 1
1 0
2 1
3 2
4 2
5 0
dtype: int64
"""
@no_type_check
def pandas_count(s) -> "ps.Series[int]":
return s.str.count(pat, flags)
return self._data.pandas_on_spark.transform_batch(pandas_count)
@no_type_check
def decode(self, encoding, errors="strict") -> "ps.Series":
"""
Not supported.
"""
raise NotImplementedError()
@no_type_check
def encode(self, encoding, errors="strict") -> "ps.Series":
"""
Not supported.
"""
raise NotImplementedError()
@no_type_check
def extract(self, pat, flags=0, expand=True) -> "ps.Series":
"""
Not supported.
"""
raise NotImplementedError()
@no_type_check
def extractall(self, pat, flags=0) -> "ps.Series":
"""
Not supported.
"""
raise NotImplementedError()
def find(self, sub: str, start: int = 0, end: Optional[int] = None) -> "ps.Series":
"""
Return the lowest index in each string in the Series where the
substring is fully contained between [start:end].
Return -1 on failure. Equivalent to standard :func:`str.find`.
Parameters
----------
sub : str
Substring being searched.
start : int
Left edge index.
end : int
Right edge index.
Returns
-------
Series of int
Series of lowest matching indexes.
Examples
--------
>>> s = ps.Series(['apple', 'oranges', 'bananas'])
>>> s.str.find('a')
0 0
1 2
2 1
dtype: int64
>>> s.str.find('a', start=2)
0 -1
1 2
2 3
dtype: int64
>>> s.str.find('a', end=1)
0 0
1 -1
2 -1
dtype: int64
>>> s.str.find('a', start=2, end=2)
0 -1
1 -1
2 -1
dtype: int64
"""
@no_type_check
def pandas_find(s) -> "ps.Series[int]":
return s.str.find(sub, start, end)
return self._data.pandas_on_spark.transform_batch(pandas_find)
def findall(self, pat: str, flags: int = 0) -> "ps.Series":
"""
Find all occurrences of pattern or regular expression in the Series.
Equivalent to applying :func:`re.findall` to all the elements in
the Series.
Parameters
----------
pat : str
Pattern or regular expression.
flags : int, default 0 (no flags)
`re` module flags, e.g. `re.IGNORECASE`.
Returns
-------
Series of object
All non-overlapping matches of pattern or regular expression in
each string of this Series.
Examples
--------
>>> s = ps.Series(['Lion', 'Monkey', 'Rabbit'])
The search for the pattern ‘Monkey’ returns one match:
>>> s.str.findall('Monkey')
0 []
1 [Monkey]
2 []
dtype: object
On the other hand, the search for the pattern ‘MONKEY’ doesn’t return
any match:
>>> s.str.findall('MONKEY')
0 []
1 []
2 []
dtype: object
Flags can be added to the pattern or regular expression. For instance,
to find the pattern ‘MONKEY’ ignoring the case:
>>> import re
>>> s.str.findall('MONKEY', flags=re.IGNORECASE)
0 []
1 [Monkey]
2 []
dtype: object
When the pattern matches more than one string in the Series, all
matches are returned:
>>> s.str.findall('on')
0 [on]
1 [on]
2 []
dtype: object
Regular expressions are supported too. For instance, the search for all
the strings ending with the word ‘on’ is shown next:
>>> s.str.findall('on$')
0 [on]
1 []
2 []
dtype: object
If the pattern is found more than once in the same string, then a list
of multiple strings is returned:
>>> s.str.findall('b')
0 []
1 []
2 [b, b]
dtype: object
"""
# Type hints cannot yet express the array element type, so pass an
# explicit ArrayType to pandas_udf instead of using transform_batch.
@pandas_udf(returnType=ArrayType(StringType(), containsNull=True)) # type: ignore
def pudf(s: pd.Series) -> pd.Series:
return s.str.findall(pat, flags)
return self._data._with_new_scol(scol=pudf(self._data.spark.column))
def index(self, sub: str, start: int = 0, end: Optional[int] = None) -> "ps.Series":
"""
Return the lowest index in each string where the substring is fully
contained between [start:end].
This is the same as :func:`str.find` except instead of returning -1,
it raises a ValueError when the substring is not found. Equivalent to
standard :func:`str.index`.
Parameters
----------
sub : str
Substring being searched.
start : int
Left edge index.
end : int
Right edge index.
Returns
-------
Series of int
Series of lowest matching indexes.
Examples
--------
>>> s = ps.Series(['apple', 'oranges', 'bananas'])
>>> s.str.index('a')
0 0
1 2
2 1
dtype: int64
The following expression throws an exception:
>>> s.str.index('a', start=2) # doctest: +SKIP
"""
@no_type_check
def pandas_index(s) -> "ps.Series[np.int64]":
return s.str.index(sub, start, end)
return self._data.pandas_on_spark.transform_batch(pandas_index)
def join(self, sep: str) -> "ps.Series":
"""
Join lists contained as elements in the Series with passed delimiter.
If the elements of a Series are lists themselves, join the content of
these lists using the delimiter passed to the function. This function
is an equivalent to calling :func:`str.join` on the lists.
Parameters
----------
sep : str
Delimiter to use between list entries.
Returns
-------
Series of object
Series with list entries concatenated by intervening occurrences of
the delimiter.
See Also
--------
str.split : Split strings around given separator/delimiter.
str.rsplit : Splits string around given separator/delimiter,
starting from the right.
Examples
--------
Example with a list that contains a None element.
>>> s = ps.Series([['lion', 'elephant', 'zebra'],
... ['cat', None, 'dog']])
>>> s
0 [lion, elephant, zebra]
1 [cat, None, dog]
dtype: object
Join all lists using a ‘-’. The list containing None will produce None.
>>> s.str.join('-')
0 lion-elephant-zebra
1 None
dtype: object
"""
@no_type_check
def pandas_join(s) -> "ps.Series[str]":
return s.str.join(sep)
return self._data.pandas_on_spark.transform_batch(pandas_join)
def len(self) -> "ps.Series":
"""
Computes the length of each element in the Series.
The element may be a sequence (such as a string, tuple or list).
Returns
-------
Series of int
A Series of integer values indicating the length of each element in
the Series.
Examples
--------
Returns the length (number of characters) in a string. Returns the
number of entries for lists or tuples.
>>> s1 = ps.Series(['dog', 'monkey'])
>>> s1.str.len()
0 3
1 6
dtype: int64
>>> s2 = ps.Series([["a", "b", "c"], []])
>>> s2.str.len()
0 3
1 0
dtype: int64
"""
if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
return self._data.spark.transform(lambda c: F.size(c).cast(LongType()))
else:
return self._data.spark.transform(lambda c: F.length(c).cast(LongType()))
def ljust(self, width: int, fillchar: str = " ") -> "ps.Series":
"""
Filling right side of strings in the Series with an additional
character. Equivalent to :func:`str.ljust`.
Parameters
----------
width : int
Minimum width of resulting string; additional characters will be
filled with `fillchar`.
fillchar : str
Additional character for filling, default is whitespace.
Returns
-------
Series of object
Examples
--------
>>> s = ps.Series(["caribou", "tiger"])
>>> s
0 caribou
1 tiger
dtype: object
>>> s.str.ljust(width=10, fillchar='-')
0 caribou---
1 tiger-----
dtype: object
"""
@no_type_check
def pandas_ljust(s) -> "ps.Series[str]":
return s.str.ljust(width, fillchar)
return self._data.pandas_on_spark.transform_batch(pandas_ljust)
def match(self, pat: str, case: bool = True, flags: int = 0, na: Any = np.NaN) -> "ps.Series":
"""
Determine if each string matches a regular expression.
Analogous to :func:`contains`, but more strict, relying on
:func:`re.match` instead of :func:`re.search`.
Parameters
----------
pat : str
Character sequence or regular expression.
case : bool, default True
If True, case sensitive.
flags : int, default 0 (no flags)
Flags to pass through to the re module, e.g. re.IGNORECASE.
na : default NaN
Fill value for missing values.
Returns
-------
Series of boolean values or object
A Series of boolean values indicating whether the given pattern can
be matched in the string of each element of the Series.
Examples
--------
>>> s = ps.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
>>> s.str.match('dog')
0 False
1 True
2 False
3 False
4 None
dtype: object
>>> s.str.match('mouse|dog', case=False)
0 True
1 True
2 False
3 False
4 None
dtype: object
>>> s.str.match('.+and.+', na=True)
0 False
1 False
2 True
3 False
4 True
dtype: bool
>>> import re
>>> s.str.match('MOUSE', flags=re.IGNORECASE)
0 True
1 False
2 False
3 False
4 None
dtype: object
"""
@no_type_check
def pandas_match(s) -> "ps.Series[bool]":
return s.str.match(pat, case, flags, na)
return self._data.pandas_on_spark.transform_batch(pandas_match)
def normalize(self, form: str) -> "ps.Series":
"""
Return the Unicode normal form for the strings in the Series.
For more information on the forms, see
:func:`unicodedata.normalize`.
Parameters
----------
form : {‘NFC’, ‘NFKC’, ‘NFD’, ‘NFKD’}
Unicode form.
Returns
-------
Series of objects
A Series of normalized strings.
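Examples
--------
A minimal sketch, skipped under doctest; the sample string is an
illustrative assumption. NFC composes 'e' plus a combining acute
accent into the single code point 'é'.
>>> s = ps.Series(['e\\u0301'])  # doctest: +SKIP
>>> s.str.normalize('NFC')  # doctest: +SKIP
0 é
dtype: object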
"""
@no_type_check
def pandas_normalize(s) -> "ps.Series[str]":
return s.str.normalize(form)
return self._data.pandas_on_spark.transform_batch(pandas_normalize)
def pad(self, width: int, side: str = "left", fillchar: str = " ") -> "ps.Series":
"""
Pad strings in the Series up to width.
Parameters
----------
width : int
Minimum width of resulting string; additional characters will be
filled with character defined in `fillchar`.
side : {‘left’, ‘right’, ‘both’}, default ‘left’
Side from which to fill resulting string.
fillchar : str, default ' '
Additional character for filling, default is whitespace.
Returns
-------
Series of object
Returns Series with each string padded up to the minimum width.
Examples
--------
>>> s = ps.Series(["caribou", "tiger"])
>>> s
0 caribou
1 tiger
dtype: object
>>> s.str.pad(width=10)
0 caribou
1 tiger
dtype: object
>>> s.str.pad(width=10, side='right', fillchar='-')
0 caribou---
1 tiger-----
dtype: object
>>> s.str.pad(width=10, side='both', fillchar='-')
0 -caribou--
1 --tiger---
dtype: object
"""
@no_type_check
def pandas_pad(s) -> "ps.Series[str]":
return s.str.pad(width, side, fillchar)
return self._data.pandas_on_spark.transform_batch(pandas_pad)
def partition(self, sep: str = " ", expand: bool = True) -> "ps.Series":
"""
Not supported.
"""
raise NotImplementedError()
def repeat(self, repeats: int) -> "ps.Series":
"""
Duplicate each string in the Series.
Parameters
----------
repeats : int
Repeat the string the given number of times (int). A sequence of
ints is not supported.
Returns
-------
Series of object
Series of repeated string objects as specified by the input
parameter repeats.
Examples
--------
>>> s = ps.Series(['a', 'b', 'c'])
>>> s
0 a
1 b
2 c
dtype: object
A single int repeats each string in the Series.
>>> s.str.repeat(repeats=2)
0 aa
1 bb
2 cc
dtype: object
"""
if not isinstance(repeats, int):
raise TypeError("repeats expects an int parameter")
return self._data.spark.transform(lambda c: SF.repeat(col=c, n=repeats))
def replace(
self,
pat: str,
repl: Union[str, Callable[[str], str]],
n: int = -1,
case: Optional[bool] = None,
flags: int = 0,
regex: bool = True,
) -> "ps.Series":
"""
Replace occurrences of pattern/regex in the Series with some other
string. Equivalent to :func:`str.replace` or :func:`re.sub`.
Parameters
----------
pat : str or compiled regex
String can be a character sequence or regular expression.
repl : str or callable
Replacement string or a callable. The callable is passed the regex
match object and must return a replacement string to be used. See
:func:`re.sub`.
n : int, default -1 (all)
Number of replacements to make from start.
case : boolean, default None
If True, case sensitive (the default if pat is a string).
Set to False for case insensitive.
Cannot be set if pat is a compiled regex.
flags: int, default 0 (no flags)
re module flags, e.g. re.IGNORECASE.
Cannot be set if pat is a compiled regex.
regex : boolean, default True
If True, assumes the passed-in pattern is a regular expression.
If False, treats the pattern as a literal string.
Cannot be set to False if pat is a compiled regex or repl is a
callable.
Returns
-------
Series of object
A copy of the string with all matching occurrences of pat replaced
by repl.
Examples
--------
When pat is a string and regex is True (the default), the given pat is
compiled as a regex. When repl is a string, it replaces matching regex
patterns as with :func:`re.sub`. NaN value(s) in the Series are changed
to None:
>>> ps.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
0 bao
1 baz
2 None
dtype: object
When pat is a string and regex is False, every pat is replaced with
repl as with :func:`str.replace`:
>>> ps.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
0 bao
1 fuz
2 None
dtype: object
When repl is a callable, it is called on every pat using
:func:`re.sub`. The callable should expect one positional argument (a
regex match object) and return a string.
Reverse every lowercase alphabetic word:
>>> repl = lambda m: m.group(0)[::-1]
>>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
0 oof 123
1 rab zab
2 None
dtype: object
Using regex groups (extract second group and swap case):
>>> pat = r"(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
>>> repl = lambda m: m.group('two').swapcase()
>>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
0 tWO
1 bAR
dtype: object
Using a compiled regex with flags:
>>> import re
>>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
>>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
0 foo
1 bar
2 None
dtype: object
"""
@no_type_check
def pandas_replace(s) -> "ps.Series[str]":
return s.str.replace(pat, repl, n=n, case=case, flags=flags, regex=regex)
return self._data.pandas_on_spark.transform_batch(pandas_replace)
def rfind(self, sub: str, start: int = 0, end: Optional[int] = None) -> "ps.Series":
"""
Return the highest index in each string in the Series where the
substring is fully contained between [start:end].
Return -1 on failure. Equivalent to standard :func:`str.rfind`.
Parameters
----------
sub : str
Substring being searched.
start : int
Left edge index.
end : int
Right edge index.
Returns
-------
Series of int
Series of highest matching indexes.
Examples
--------
>>> s = ps.Series(['apple', 'oranges', 'bananas'])
>>> s.str.rfind('a')
0 0
1 2
2 5
dtype: int64
>>> s.str.rfind('a', start=2)
0 -1
1 2
2 5
dtype: int64
>>> s.str.rfind('a', end=1)
0 0
1 -1
2 -1
dtype: int64
>>> s.str.rfind('a', start=2, end=2)
0 -1
1 -1
2 -1
dtype: int64
"""
@no_type_check
def pandas_rfind(s) -> "ps.Series[int]":
return s.str.rfind(sub, start, end)
return self._data.pandas_on_spark.transform_batch(pandas_rfind)
def rindex(self, sub: str, start: int = 0, end: Optional[int] = None) -> "ps.Series":
"""
Return the highest index in each string where the substring is fully
contained between [start:end].
This is the same as :func:`str.rfind` except instead of returning -1,
it raises a ValueError when the substring is not found. Equivalent to
standard :func:`str.rindex`.
Parameters
----------
sub : str
Substring being searched.
start : int
Left edge index.
end : int
Right edge index.
Returns
-------
Series of int
Series of highest matching indexes.
Examples
--------
>>> s = ps.Series(['apple', 'oranges', 'bananas'])
>>> s.str.rindex('a')
0 0
1 2
2 5
dtype: int64
The following expression throws an exception:
>>> s.str.rindex('a', start=2) # doctest: +SKIP
"""
@no_type_check
def pandas_rindex(s) -> "ps.Series[np.int64]":
return s.str.rindex(sub, start, end)
return self._data.pandas_on_spark.transform_batch(pandas_rindex)
def rjust(self, width: int, fillchar: str = " ") -> "ps.Series":
"""
Filling left side of strings in the Series with an additional
character. Equivalent to :func:`str.rjust`.
Parameters
----------
width : int
Minimum width of resulting string; additional characters will be
filled with `fillchar`.
fillchar : str
Additional character for filling, default is whitespace.
Returns
-------
Series of object
Examples
--------
>>> s = ps.Series(["caribou", "tiger"])
>>> s
0 caribou
1 tiger
dtype: object
>>> s.str.rjust(width=10)
0 caribou
1 tiger
dtype: object
>>> s.str.rjust(width=10, fillchar='-')
0 ---caribou
1 -----tiger
dtype: object
"""
@no_type_check
def pandas_rjust(s) -> "ps.Series[str]":
return s.str.rjust(width, fillchar)
return self._data.pandas_on_spark.transform_batch(pandas_rjust)
def rpartition(self, sep: str = " ", expand: bool = True) -> "ps.Series":
"""
Not supported.
"""
raise NotImplementedError()
def slice(
self, start: Optional[int] = None, stop: Optional[int] = None, step: Optional[int] = None
) -> "ps.Series":
"""
Slice substrings from each element in the Series.
Parameters
----------
start : int, optional
Start position for slice operation.
stop : int, optional
Stop position for slice operation.
step : int, optional
Step size for slice operation.
Returns
-------
Series of object
Series of substrings sliced from the original string objects.
Examples
--------
>>> s = ps.Series(["koala", "fox", "chameleon"])
>>> s
0 koala
1 fox
2 chameleon
dtype: object
>>> s.str.slice(start=1)
0 oala
1 ox
2 hameleon
dtype: object
>>> s.str.slice(stop=2)
0 ko
1 fo
2 ch
dtype: object
>>> s.str.slice(step=2)
0 kaa
1 fx
2 caeen
dtype: object
>>> s.str.slice(start=0, stop=5, step=3)
0 kl
1 f
2 cm
dtype: object
"""
@no_type_check
def pandas_slice(s) -> "ps.Series[str]":
return s.str.slice(start, stop, step)
return self._data.pandas_on_spark.transform_batch(pandas_slice)
def slice_replace(
self, start: Optional[int] = None, stop: Optional[int] = None, repl: Optional[str] = None
) -> "ps.Series":
"""
Replace a positional slice of each string in the Series with repl.
Parameters
----------
start : int, optional
Start position for slice operation. If not specified (None), the
slice is unbounded on the left, i.e. slice from the start of the
string.
stop : int, optional
Stop position for slice operation. If not specified (None), the
slice is unbounded on the right, i.e. slice until the end of the
string.
repl : str, optional
String for replacement. If not specified (None), the sliced region
is replaced with an empty string.
Returns
-------
Series of object
Series with the specified slice replaced in each string.
Examples
--------
>>> s = ps.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
>>> s
0 a
1 ab
2 abc
3 abdc
4 abcde
dtype: object
Specify just start, meaning replace start until the end of the string
with repl.
>>> s.str.slice_replace(1, repl='X')
0 aX
1 aX
2 aX
3 aX
4 aX
dtype: object
Specify just stop, meaning the start of the string to stop is replaced
with repl, and the rest of the string is included.
>>> s.str.slice_replace(stop=2, repl='X')
0 X
1 X
2 Xc
3 Xdc
4 Xcde
dtype: object
Specify start and stop, meaning the slice from start to stop is
replaced with repl. Everything before or after start and stop is
included as is.
>>> s.str.slice_replace(start=1, stop=3, repl='X')
0 aX
1 aX
2 aX
3 aXc
4 aXde
dtype: object
"""
@no_type_check
def pandas_slice_replace(s) -> "ps.Series[str]":
return s.str.slice_replace(start, stop, repl)
return self._data.pandas_on_spark.transform_batch(pandas_slice_replace)
def split(
self, pat: Optional[str] = None, n: int = -1, expand: bool = False
) -> Union["ps.Series", "ps.DataFrame"]:
"""
Split strings around given separator/delimiter.
Splits the string in the Series from the beginning, at the specified
delimiter string. Equivalent to :func:`str.split`.
Parameters
----------
pat : str, optional
String or regular expression to split on. If not specified, split
on whitespace.
n : int, default -1 (all)
Limit number of splits in output. None, 0 and -1 will be
interpreted as return all splits.
expand : bool, default False
Expand the split strings into separate columns.
* If ``True``, `n` must be a positive integer, and return DataFrame expanding
dimensionality.
* If ``False``, return Series, containing lists of strings.
Returns
-------
Series, DataFrame
Type matches caller unless `expand=True` (see Notes).
See Also
--------
str.rsplit : Splits string around given separator/delimiter,
starting from the right.
str.join : Join lists contained as elements in the Series/Index
with passed delimiter.
Notes
-----
The handling of the `n` keyword depends on the number of found splits:
- If found splits > `n`, make first `n` splits only
- If found splits <= `n`, make all splits
- If for a certain row the number of found splits < `n`,
append `None` for padding up to `n` if ``expand=True``
If using ``expand=True``, Series callers return DataFrame objects with `n + 1` columns.
.. note:: Even if `n` is much larger than found splits, the number of columns does NOT
shrink unlike pandas.
Examples
--------
>>> s = ps.Series(["this is a regular sentence",
... "https://docs.python.org/3/tutorial/index.html",
... np.nan])
In the default setting, the string is split by whitespace.
>>> s.str.split()
0 [this, is, a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 None
dtype: object
Without the n parameter, the outputs of rsplit and split are identical.
>>> s.str.rsplit()
0 [this, is, a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 None
dtype: object
The n parameter can be used to limit the number of splits on the
delimiter. The outputs of split and rsplit are different.
>>> s.str.split(n=2)
0 [this, is, a regular sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 None
dtype: object
>>> s.str.rsplit(n=2)
0 [this is a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 None
dtype: object
The pat parameter can be used to split by other characters.
>>> s.str.split(pat = "/")
0 [this is a regular sentence]
1 [https:, , docs.python.org, 3, tutorial, index...
2 None
dtype: object
When using ``expand=True``, the split elements will expand out into
separate columns. If NaN is present, it is propagated throughout
the columns during the split.
>>> s.str.split(n=4, expand=True)
0 1 2 3 4
0 this is a regular sentence
1 https://docs.python.org/3/tutorial/index.html None None None None
2 None None None None None
For slightly more complex use cases like splitting the HTML document name
from a URL, a combination of parameter settings can be used.
>>> s.str.rsplit("/", n=1, expand=True)
0 1
0 this is a regular sentence None
1 https://docs.python.org/3/tutorial index.html
2 None None
Remember to escape special characters when explicitly using regular
expressions.
>>> s = ps.Series(["1+1=2"])
>>> s.str.split(r"\\+|=", n=2, expand=True)
0 1 2
0 1 1 2
"""
from pyspark.pandas.frame import DataFrame
if expand and n <= 0:
raise NotImplementedError("expand=True is currently only supported with n > 0.")
# Type hints cannot yet express the array element type, so pass an
# explicit ArrayType to pandas_udf instead of using transform_batch.
return_type = ArrayType(StringType(), containsNull=True)
@pandas_udf(returnType=return_type) # type: ignore
def pudf(s: pd.Series) -> pd.Series:
return s.str.split(pat, n)
psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
field=self._data._internal.data_fields[0].copy(spark_type=return_type, nullable=True),
)
if expand:
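# Indexing the array column positionally: positions past the end of a
# row's array yield null, so the frame always has exactly n + 1 columns
# no matter how many splits each row produced.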
psdf = psser.to_frame()
scol = psdf._internal.data_spark_columns[0]
spark_columns = [scol[i].alias(str(i)) for i in range(n + 1)]
column_labels = [(i,) for i in range(n + 1)]
internal = psdf._internal.with_new_columns(
spark_columns,
column_labels=cast(Optional[List], column_labels),
data_fields=[
self._data._internal.data_fields[0].copy(name=str(i), nullable=True)
for i in range(n + 1)
],
)
return DataFrame(internal)
else:
return psser
def rsplit(
self, pat: Optional[str] = None, n: int = -1, expand: bool = False
) -> Union["ps.Series", "ps.DataFrame"]:
"""
Split strings around given separator/delimiter.
Splits the string in the Series from the end, at the specified
delimiter string. Equivalent to :func:`str.rsplit`.
Parameters
----------
pat : str, optional
String or regular expression to split on. If not specified, split
on whitespace.
n : int, default -1 (all)
Limit number of splits in output. None, 0 and -1 will be
interpreted as return all splits.
expand : bool, default False
Expand the split strings into separate columns.
* If ``True``, `n` must be a positive integer, and return DataFrame expanding
dimensionality.
* If ``False``, return Series, containing lists of strings.
Returns
-------
Series, DataFrame
Type matches caller unless `expand=True` (see Notes).
See Also
--------
str.split : Split strings around given separator/delimiter.
str.join : Join lists contained as elements in the Series/Index
with passed delimiter.
Notes
-----
The handling of the `n` keyword depends on the number of found splits:
- If found splits > `n`, make first `n` splits only
- If found splits <= `n`, make all splits
- If for a certain row the number of found splits < `n`,
append `None` for padding up to `n` if ``expand=True``
If using ``expand=True``, Series callers return DataFrame objects with `n + 1` columns.
.. note:: Even if `n` is much larger than found splits, the number of columns does NOT
shrink unlike pandas.
Examples
--------
>>> s = ps.Series(["this is a regular sentence",
... "https://docs.python.org/3/tutorial/index.html",
... np.nan])
In the default setting, the string is split by whitespace.
>>> s.str.split()
0 [this, is, a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 None
dtype: object
Without the n parameter, the outputs of rsplit and split are identical.
>>> s.str.rsplit()
0 [this, is, a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 None
dtype: object
The n parameter can be used to limit the number of splits on the
delimiter. The outputs of split and rsplit are different.
>>> s.str.split(n=2)
0 [this, is, a regular sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 None
dtype: object
>>> s.str.rsplit(n=2)
0 [this is a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 None
dtype: object
When using ``expand=True``, the split elements will expand out into
separate columns. If NaN is present, it is propagated throughout
the columns during the split.
>>> s.str.split(n=4, expand=True)
0 1 2 3 4
0 this is a regular sentence
1 https://docs.python.org/3/tutorial/index.html None None None None
2 None None None None None
For slightly more complex use cases like splitting the HTML document name
from a URL, a combination of parameter settings can be used.
>>> s.str.rsplit("/", n=1, expand=True)
0 1
0 this is a regular sentence None
1 https://docs.python.org/3/tutorial index.html
2 None None
Remember to escape special characters when explicitly using regular
expressions.
>>> s = ps.Series(["1+1=2"])
>>> s.str.split(r"\\+|=", n=2, expand=True)
0 1 2
0 1 1 2
"""
from pyspark.pandas.frame import DataFrame
if expand and n <= 0:
raise NotImplementedError("expand=True is currently only supported with n > 0.")
# Type hints cannot yet express the array element type, so pass an
# explicit ArrayType to pandas_udf instead of using transform_batch.
return_type = ArrayType(StringType(), containsNull=True)
@pandas_udf(returnType=return_type) # type: ignore
def pudf(s: pd.Series) -> pd.Series:
return s.str.rsplit(pat, n)
psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
field=self._data._internal.data_fields[0].copy(spark_type=return_type, nullable=True),
)
if expand:
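# As in split above, positional indexing past a row's array length yields
# null, keeping the expanded frame at exactly n + 1 columns.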
psdf = psser.to_frame()
scol = psdf._internal.data_spark_columns[0]
spark_columns = [scol[i].alias(str(i)) for i in range(n + 1)]
column_labels = [(i,) for i in range(n + 1)]
internal = psdf._internal.with_new_columns(
spark_columns,
column_labels=cast(Optional[List], column_labels),
data_fields=[
self._data._internal.data_fields[0].copy(name=str(i), nullable=True)
for i in range(n + 1)
],
)
return DataFrame(internal)
else:
return psser
def translate(self, table: Dict) -> "ps.Series":
"""
Map all characters in the string through the given mapping table.
Equivalent to standard :func:`str.translate`.
Parameters
----------
table : dict
Table is a mapping of Unicode ordinals to Unicode ordinals,
strings, or None. Unmapped characters are left untouched.
Characters mapped to None are deleted. :func:`str.maketrans` is a
helper function for making translation tables.
Returns
-------
Series of object
Series with translated strings.
Examples
--------
>>> s = ps.Series(["dog", "cat", "bird"])
>>> m = str.maketrans({'a': 'X', 'i': 'Y', 'o': None})
>>> s.str.translate(m)
0 dg
1 cXt
2 bYrd
dtype: object
"""
@no_type_check
def pandas_translate(s) -> "ps.Series[str]":
return s.str.translate(table)
return self._data.pandas_on_spark.transform_batch(pandas_translate)
def wrap(self, width: int, **kwargs: bool) -> "ps.Series":
"""
Wrap long strings in the Series to be formatted in paragraphs with
length less than a given width.
This method has the same keyword parameters and defaults as
:class:`textwrap.TextWrapper`.
Parameters
----------
width : int
Maximum line width. Lines are separated with the newline character.
expand_tabs : bool, optional
If true, tab characters will be expanded to spaces (default: True).
replace_whitespace : bool, optional
If true, each whitespace character remaining after tab expansion
will be replaced by a single space (default: True).
drop_whitespace : bool, optional
If true, whitespace that, after wrapping, happens to end up at the
beginning or end of a line is dropped (default: True).
break_long_words : bool, optional
If true, then words longer than width will be broken in order to
ensure that no lines are longer than width. If it is false, long
words will not be broken, and some lines may be longer than width
(default: True).
break_on_hyphens : bool, optional
If true, wrapping will occur preferably on whitespace and right
after hyphens in compound words, as it is customary in English.
If false, only whitespaces will be considered as potentially good
places for line breaks, but you need to set break_long_words to
false if you want truly insecable words (default: True).
Returns
-------
Series of object
Series with wrapped strings.
Examples
--------
>>> s = ps.Series(['line to be wrapped', 'another line to be wrapped'])
>>> s.str.wrap(12)
0 line to be\\nwrapped
1 another line\\nto be\\nwrapped
dtype: object
"""
@no_type_check
def pandas_wrap(s) -> "ps.Series[str]":
return s.str.wrap(width, **kwargs)
return self._data.pandas_on_spark.transform_batch(pandas_wrap)
def zfill(self, width: int) -> "ps.Series":
"""
Pad strings in the Series by prepending ‘0’ characters.
Strings in the Series are padded with ‘0’ characters on the left of the
string to reach a total string length width. Strings in the Series with
length greater or equal to width are unchanged.
Differs from :func:`str.zfill` which has special handling for ‘+’/‘-’
in the string.
Parameters
----------
width : int
Minimum length of resulting string; strings with length less than
width will be prepended with ‘0’ characters.
Returns
-------
Series of object
Series with '0' left-padded strings.
Examples
--------
>>> s = ps.Series(['-1', '1', '1000', np.nan])
>>> s
0 -1
1 1
2 1000
3 None
dtype: object
Note that NaN is not a string, therefore it is shown as None and left
unchanged. The minus sign in '-1' is treated as a regular character and
the zero is added to the left of it (:func:`str.zfill` would have moved
it to the left). 1000 remains unchanged as it is longer than width.
>>> s.str.zfill(3)
0 0-1
1 001
2 1000
3 None
dtype: object
"""
@no_type_check
def pandas_zfill(s) -> "ps.Series[str]":
return s.str.zfill(width)
return self._data.pandas_on_spark.transform_batch(pandas_zfill)
@no_type_check
def get_dummies(self, sep: str = "|") -> "ps.DataFrame":
"""
Not supported.
"""
raise NotImplementedError()
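# Doctest harness: running this module directly executes the doctest
# examples above. It assumes SPARK_HOME is set, since _test() changes the
# working directory into it before starting a local Spark session.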
def _test() -> None:
import os
import doctest
import sys
from pyspark.sql import SparkSession
import pyspark.pandas.strings
os.chdir(os.environ["SPARK_HOME"])
globs = pyspark.pandas.strings.__dict__.copy()
globs["ps"] = pyspark.pandas
spark = (
SparkSession.builder.master("local[4]")
.appName("pyspark.pandas.strings tests")
.getOrCreate()
)
(failure_count, test_count) = doctest.testmod(
pyspark.pandas.strings,
globs=globs,
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
)
spark.stop()
if failure_count:
sys.exit(-1)
if __name__ == "__main__":
_test()