<!-- blob: 7916f43052fd3c23e3884cfa1a7aba586fa4f238 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>pyspark.pandas.extensions &#8212; PySpark 3.5.5 documentation</title>
<link href="../../../_static/styles/theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=1999514e3f237ded88cf" rel="stylesheet">
<link rel="stylesheet"
href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet" href="../../../_static/styles/pydata-sphinx-theme.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf">
<script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/language_data.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/pandas/extensions.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en">
<!-- Google Analytics -->
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<div class="container-fluid" id="banner"></div>
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"><div class="container-xl">
<div id="navbar-start">
<a class="navbar-brand" href="../../../index.html">
<img src="../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home">
</a>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-collapsible" aria-controls="navbar-collapsible" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-collapsible" class="col-lg-9 collapse navbar-collapse">
<div id="navbar-center" class="mr-auto">
<div class="navbar-center-item">
<ul id="navbar-main-elements" class="navbar-nav">
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../index.html">
Overview
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../development/index.html">
Development
</a>
</li>
<li class="toctree-l1 nav-item">
<a class="reference internal nav-link" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</div>
</div>
<div id="navbar-end">
<div class="navbar-end-item">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
3.5.5
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Build the docs-homepage URL for one entry of the versions JSON.
// Only the first "{version}" placeholder of the template is substituted,
// using the entry's `version` field.
function buildURL(entry) {
  const urlTemplate = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return urlTemplate.replace("{version}", entry.version);
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version.
//
// `event` is the click event of a version-switcher link whose href is the
// homepage of the other docs version. A HEAD request probes for the current
// page under that version: on success the browser navigates to it, on failure
// it navigates to the homepage. Always returns false so the link's default
// navigation does not run before the probe has finished.
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "_modules/pyspark/pandas/extensions.html",
        otherDocsHomepage = event.target.getAttribute("href");
  // The homepage href ends in "index.html" (see buildURL). Appending the page
  // path to it unchanged would yield ".../index.html_modules/...", which never
  // exists, so the probe would always fail. Drop the trailing file name to get
  // the docs root; hrefs that already end in "/" are left untouched.
  const otherDocsRoot = otherDocsHomepage.replace(/index\.html$/, "");
  const tryUrl = `${otherDocsRoot}${currentFilePath}`;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // page is missing in that version (or the probe failed): use its homepage
    location.href = otherDocsHomepage;
  });
  return false;
}
// Fill the version switcher dropdown from the published versions JSON.
(function () {
  $.getJSON("https://spark.apache.org/static/versions.json", function (versions, textStatus, jqXHR) {
    // Links are appended in the order of the JSON entries. Each one points at
    // the homepage of its docs version; the click handler then tries the
    // matching page of that version first.
    const menu = $("#version_switcher");
    $.each(versions, function (index, entry) {
      // No custom display name (e.g., "latest") given: fall back to the version string.
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      entry.url = buildURL(entry);

      const link = document.createElement("a");
      link.className = "list-group-item list-group-item-action py-1";
      link.setAttribute("href", entry.url);
      link.textContent = String(entry.name);
      link.onclick = checkPageExistsAndRedirect;
      menu.append(link);
    });
  });
})();
</script>
</div>
</div>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<!-- Only show if we have sidebars configured, else just a small margin -->
<div class="col-12 col-md-3 bd-sidebar">
<div class="sidebar-start-items"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get">
<i class="icon fas fa-search"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
</div>
</nav>
</div>
<div class="sidebar-end-items">
</div>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<h1>Source code for pyspark.pandas.extensions</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">Generic</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Type</span><span class="p">,</span> <span class="n">Union</span><span class="p">,</span> <span class="n">TYPE_CHECKING</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">warnings</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.pandas._typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">T</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.pandas.frame</span><span class="w"> </span><span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.pandas.indexes</span><span class="w"> </span><span class="kn">import</span> <span class="n">Index</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.pandas.series</span><span class="w"> </span><span class="kn">import</span> <span class="n">Series</span>
<span class="k">class</span><span class="w"> </span><span class="nc">CachedAccessor</span><span class="p">(</span><span class="n">Generic</span><span class="p">[</span><span class="n">T</span><span class="p">]):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Custom property-like object.</span>
<span class="sd"> A descriptor for caching accessors:</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> Namespace that accessor methods, properties, etc will be accessed under, e.g. &quot;foo&quot; for a</span>
<span class="sd"> dataframe accessor yields the accessor ``df.foo``</span>
<span class="sd"> accessor: cls</span>
<span class="sd"> Class with the extension methods.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> For accessor, the class&#39;s __init__ method assumes that you are registering an accessor for one</span>
<span class="sd"> of ``Series``, ``DataFrame``, or ``Index``.</span>
<span class="sd"> This object is not meant to be instantiated directly. Instead, use register_dataframe_accessor,</span>
<span class="sd"> register_series_accessor, or register_index_accessor.</span>
<span class="sd"> The pandas-on-Spark accessor is modified based on pandas.core.accessor.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">accessor</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_name</span> <span class="o">=</span> <span class="n">name</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_accessor</span> <span class="o">=</span> <span class="n">accessor</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__get__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">obj</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">&quot;DataFrame&quot;</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="s2">&quot;Index&quot;</span><span class="p">]],</span> <span class="bp">cls</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">T</span><span class="p">,</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]]:</span>
<span class="k">if</span> <span class="n">obj</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_accessor</span>
<span class="n">accessor_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_accessor</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span> <span class="c1"># type: ignore[call-arg]</span>
<span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_name</span><span class="p">,</span> <span class="n">accessor_obj</span><span class="p">)</span>
<span class="k">return</span> <span class="n">accessor_obj</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_register_accessor</span><span class="p">(</span>
<span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="bp">cls</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="s2">&quot;DataFrame&quot;</span><span class="p">],</span> <span class="n">Type</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">],</span> <span class="n">Type</span><span class="p">[</span><span class="s2">&quot;Index&quot;</span><span class="p">]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Register a custom accessor on {klass} objects.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> Name under which the accessor should be registered. A warning is issued if this name</span>
<span class="sd"> conflicts with a preexisting attribute.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> callable</span>
<span class="sd"> A class decorator.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> register_dataframe_accessor: Register a custom accessor on DataFrame objects</span>
<span class="sd"> register_series_accessor: Register a custom accessor on Series objects</span>
<span class="sd"> register_index_accessor: Register a custom accessor on Index objects</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> When accessed, your accessor will be initialized with the pandas-on-Spark object the user</span>
<span class="sd"> is interacting with. The code signature must be:</span>
<span class="sd"> .. code-block:: python</span>
<span class="sd"> def __init__(self, pandas_on_spark_obj):</span>
<span class="sd"> # constructor logic</span>
<span class="sd"> ...</span>
<span class="sd"> In the pandas API, if data passed to your accessor has an incorrect dtype, it&#39;s recommended to</span>
<span class="sd"> raise an ``AttributeError`` for consistency purposes. In pandas-on-Spark, ``ValueError`` is more</span>
<span class="sd"> frequently used to annotate when a value&#39;s datatype is unexpected for a given method/function.</span>
<span class="sd"> Ultimately, you can structure this however you like, but pandas-on-Spark would likely do</span>
<span class="sd"> something like this:</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([&#39;a&#39;, &#39;b&#39;]).dt</span>
<span class="sd"> ...</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: Cannot call DatetimeMethods on type StringType()</span>
<span class="sd"> Note: This function is not meant to be used directly - instead, use register_dataframe_accessor,</span>
<span class="sd"> register_series_accessor, or register_index_accessor.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="nf">decorator</span><span class="p">(</span><span class="n">accessor</span><span class="p">:</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span>
<span class="n">msg</span> <span class="o">=</span> <span class="p">(</span>
<span class="s2">&quot;registration of accessor </span><span class="si">{0}</span><span class="s2"> under name &#39;</span><span class="si">{1}</span><span class="s2">&#39; for type </span><span class="si">{2}</span><span class="s2"> is overriding &quot;</span>
<span class="s2">&quot;a preexisting attribute with the same name.&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">accessor</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="bp">cls</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="n">msg</span><span class="p">,</span>
<span class="ne">UserWarning</span><span class="p">,</span>
<span class="n">stacklevel</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
<span class="p">)</span>
<span class="nb">setattr</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">name</span><span class="p">,</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">accessor</span><span class="p">))</span>
<span class="k">return</span> <span class="n">accessor</span>
<span class="k">return</span> <span class="n">decorator</span>
<div class="viewcode-block" id="register_dataframe_accessor"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.extensions.register_dataframe_accessor.html#pyspark.pandas.extensions.register_dataframe_accessor">[docs]</a><span class="k">def</span><span class="w"> </span><span class="nf">register_dataframe_accessor</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Register a custom accessor with a DataFrame</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> name used when calling the accessor after it is registered</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> callable</span>
<span class="sd"> A class decorator.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> register_series_accessor: Register a custom accessor on Series objects</span>
<span class="sd"> register_index_accessor: Register a custom accessor on Index objects</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> When accessed, your accessor will be initialized with the pandas-on-Spark object the user</span>
<span class="sd"> is interacting with. The accessor&#39;s init method should always ingest the object being accessed.</span>
<span class="sd"> See the examples for the init signature.</span>
<span class="sd"> In the pandas API, if data passed to your accessor has an incorrect dtype, it&#39;s recommended to</span>
<span class="sd"> raise an ``AttributeError`` for consistency purposes. In pandas-on-Spark, ``ValueError`` is more</span>
<span class="sd"> frequently used to annotate when a value&#39;s datatype is unexpected for a given method/function.</span>
<span class="sd"> Ultimately, you can structure this however you like, but pandas-on-Spark would likely do</span>
<span class="sd"> something like this:</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([&#39;a&#39;, &#39;b&#39;]).dt</span>
<span class="sd"> ...</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: Cannot call DatetimeMethods on type StringType()</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> In your library code::</span>
<span class="sd"> from pyspark.pandas.extensions import register_dataframe_accessor</span>
<span class="sd"> @register_dataframe_accessor(&quot;geo&quot;)</span>
<span class="sd"> class GeoAccessor:</span>
<span class="sd"> def __init__(self, pandas_on_spark_obj):</span>
<span class="sd"> self._obj = pandas_on_spark_obj</span>
<span class="sd"> # other constructor logic</span>
<span class="sd"> @property</span>
<span class="sd"> def center(self):</span>
<span class="sd"> # return the geographic center point of this DataFrame</span>
<span class="sd"> lat = self._obj.latitude</span>
<span class="sd"> lon = self._obj.longitude</span>
<span class="sd"> return (float(lon.mean()), float(lat.mean()))</span>
<span class="sd"> def plot(self):</span>
<span class="sd"> # plot this array&#39;s data on a map</span>
<span class="sd"> pass</span>
<span class="sd"> Then, in an ipython session::</span>
<span class="sd"> &gt;&gt;&gt; ## Import if the accessor is in the other file.</span>
<span class="sd"> &gt;&gt;&gt; # from my_ext_lib import GeoAccessor</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&quot;longitude&quot;: np.linspace(0,10),</span>
<span class="sd"> ... &quot;latitude&quot;: np.linspace(0, 20)})</span>
<span class="sd"> &gt;&gt;&gt; psdf.geo.center # doctest: +SKIP</span>
<span class="sd"> (5.0, 10.0)</span>
<span class="sd"> &gt;&gt;&gt; psdf.geo.plot() # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.pandas</span><span class="w"> </span><span class="kn">import</span> <span class="n">DataFrame</span>
<span class="k">return</span> <span class="n">_register_accessor</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)</span></div>
<div class="viewcode-block" id="register_series_accessor"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.extensions.register_series_accessor.html#pyspark.pandas.extensions.register_series_accessor">[docs]</a><span class="k">def</span><span class="w"> </span><span class="nf">register_series_accessor</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Register a custom accessor with a Series object</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> name used when calling the accessor after it is registered</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> callable</span>
<span class="sd"> A class decorator.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> register_dataframe_accessor: Register a custom accessor on DataFrame objects</span>
<span class="sd"> register_index_accessor: Register a custom accessor on Index objects</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> When accessed, your accessor will be initialized with the pandas-on-Spark object the user is</span>
<span class="sd"> interacting with. The code signature must be::</span>
<span class="sd"> def __init__(self, pandas_on_spark_obj):</span>
<span class="sd"> # constructor logic</span>
<span class="sd"> ...</span>
<span class="sd"> In the pandas API, if data passed to your accessor has an incorrect dtype, it&#39;s recommended to</span>
<span class="sd"> raise an ``AttributeError`` for consistency purposes. In pandas-on-Spark, ``ValueError`` is more</span>
<span class="sd"> frequently used to annotate when a value&#39;s datatype is unexpected for a given method/function.</span>
<span class="sd"> Ultimately, you can structure this however you like, but pandas-on-Spark would likely do</span>
<span class="sd"> something like this:</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([&#39;a&#39;, &#39;b&#39;]).dt</span>
<span class="sd"> ...</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: Cannot call DatetimeMethods on type StringType()</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> In your library code::</span>
<span class="sd"> from pyspark.pandas.extensions import register_series_accessor</span>
<span class="sd"> @register_series_accessor(&quot;geo&quot;)</span>
<span class="sd"> class GeoAccessor:</span>
<span class="sd"> def __init__(self, pandas_on_spark_obj):</span>
<span class="sd"> self._obj = pandas_on_spark_obj</span>
<span class="sd"> @property</span>
<span class="sd"> def is_valid(self):</span>
<span class="sd"> # boolean check to see if series contains valid geometry</span>
<span class="sd"> return True</span>
<span class="sd"> Then, in an ipython session::</span>
<span class="sd"> &gt;&gt;&gt; ## Import if the accessor is in the other file.</span>
<span class="sd"> &gt;&gt;&gt; # from my_ext_lib import GeoAccessor</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&quot;longitude&quot;: np.linspace(0,10),</span>
<span class="sd"> ... &quot;latitude&quot;: np.linspace(0, 20)})</span>
<span class="sd"> &gt;&gt;&gt; psdf.longitude.geo.is_valid # doctest: +SKIP</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.pandas</span><span class="w"> </span><span class="kn">import</span> <span class="n">Series</span>
<span class="k">return</span> <span class="n">_register_accessor</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span></div>
<div class="viewcode-block" id="register_index_accessor"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.extensions.register_index_accessor.html#pyspark.pandas.extensions.register_index_accessor">[docs]</a><span class="k">def</span><span class="w"> </span><span class="nf">register_index_accessor</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]],</span> <span class="n">Type</span><span class="p">[</span><span class="n">T</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Register a custom accessor with an Index</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> name used when calling the accessor after it is registered</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> callable</span>
<span class="sd"> A class decorator.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> register_dataframe_accessor: Register a custom accessor on DataFrame objects</span>
<span class="sd"> register_series_accessor: Register a custom accessor on Series objects</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> When accessed, your accessor will be initialized with the pandas-on-Spark object the user is</span>
<span class="sd"> interacting with. The code signature must be::</span>
<span class="sd"> def __init__(self, pandas_on_spark_obj):</span>
<span class="sd"> # constructor logic</span>
<span class="sd"> ...</span>
<span class="sd"> In the pandas API, if data passed to your accessor has an incorrect dtype, it&#39;s recommended to</span>
<span class="sd"> raise an ``AttributeError`` for consistency purposes. In pandas-on-Spark, ``ValueError`` is more</span>
<span class="sd"> frequently used to annotate when a value&#39;s datatype is unexpected for a given method/function.</span>
<span class="sd"> Ultimately, you can structure this however you like, but pandas-on-Spark would likely do</span>
<span class="sd"> something like this:</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([&#39;a&#39;, &#39;b&#39;]).dt</span>
<span class="sd"> ...</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: Cannot call DatetimeMethods on type StringType()</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> In your library code::</span>
<span class="sd"> from pyspark.pandas.extensions import register_index_accessor</span>
<span class="sd"> @register_index_accessor(&quot;foo&quot;)</span>
<span class="sd"> class CustomAccessor:</span>
<span class="sd"> def __init__(self, pandas_on_spark_obj):</span>
<span class="sd"> self._obj = pandas_on_spark_obj</span>
<span class="sd"> self.item = &quot;baz&quot;</span>
<span class="sd"> @property</span>
<span class="sd"> def bar(self):</span>
<span class="sd"> # return item value</span>
<span class="sd"> return self.item</span>
<span class="sd"> Then, in an ipython session::</span>
<span class="sd"> &gt;&gt;&gt; ## Import if the accessor is in another file.</span>
<span class="sd"> &gt;&gt;&gt; # from my_ext_lib import CustomAccessor</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&quot;longitude&quot;: np.linspace(0,10),</span>
<span class="sd"> ... &quot;latitude&quot;: np.linspace(0, 20)})</span>
<span class="sd"> &gt;&gt;&gt; psdf.index.foo.bar # doctest: +SKIP</span>
<span class="sd"> &#39;baz&#39;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.pandas</span><span class="w"> </span><span class="kn">import</span> <span class="n">Index</span>
<span class="k">return</span> <span class="n">_register_accessor</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span></div>
<span class="k">def</span><span class="w"> </span><span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">doctest</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">pyspark.pandas.extensions</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">extensions</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;np&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">numpy</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;ps&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span>
<span class="n">spark</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;pyspark.pandas.extensions tests&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">extensions</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</div>
<!-- Previous / next buttons -->
<div class='prev-next-area'>
</div>
</main>
</div>
</div>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<div class="footer-item">
<p class="copyright">
&copy; Copyright The Apache Software Foundation.<br>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br>
</p>
</div>
</div>
</footer>
</body>
</html>