<!-- blob: 88aaef01293519bcb92d6d5c1409dbb453766ad2 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en">
<!-- lang matches the docsearch:language meta value declared in the head. -->
<head>
<meta charset="utf-8" />
<title>pyspark.pandas.generic &#8212; PySpark 3.3.4 documentation</title>
<!-- Theme stylesheet, vendored icon/text fonts, then Sphinx and project styles. -->
<link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
<link rel="stylesheet"
href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet"
href="../../../_static/vendor/open-sans_all/1.44.1/index.css">
<link rel="stylesheet"
href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css">
<link rel="stylesheet" href="../../../_static/basic.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<!-- Scripts are loaded synchronously and in this order; later ones may rely on earlier ones. -->
<link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js">
<script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/language_data.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<!-- The MathJax configuration block must precede the loader: the loader is async and
     may run as soon as it is fetched, and it only picks up config blocks that already
     exist in the document at that moment. -->
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
<script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<!-- Page metadata. -->
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/pandas/generic.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en" />
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<!-- Top bar: brand logo linking to the documentation index, a collapse toggle,
     and the links to the top-level documentation sections. -->
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation">
<div class="container-xl">
<a class="navbar-brand" href="../../../index.html">
<!-- The image is the only content of the link, so its alt text names the link target. -->
<img src="../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home" />
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
<ul id="navbar-main-elements" class="navbar-nav mr-auto">
<li class="nav-item">
<a class="nav-link" href="../../../getting_started/index.html">Getting Started</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../../../user_guide/index.html">User Guide</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../../../reference/index.html">API Reference</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../../../development/index.html">Development</a>
</li>
<li class="nav-item">
<a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a>
</li>
</ul>
<!-- Second navbar list; it has no entries on this page. -->
<ul class="navbar-nav">
</ul>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<!-- Left sidebar: documentation search form followed by the section navigation. -->
<div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get">
<!-- Decorative icon; the input carries the accessible name. -->
<i class="icon fas fa-search" aria-hidden="true"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<ul class="nav bd-sidenav">
</ul>
<!-- Close .bd-toc-item explicitly so the nav and sidebar end tags pair up correctly. -->
</div>
</nav>
</div>
<!-- In-page table of contents; #bd-toc-nav is the scroll-spy target named on the body element. -->
<div class="d-none d-xl-block col-xl-2 bd-toc">
<nav id="bd-toc-nav" aria-label="On this page">
<ul class="nav section-nav flex-column">
</ul>
</nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<h1>Source code for pyspark.pandas.generic</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">A base class of DataFrame/Column to behave similar to pandas DataFrame/Series.</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABCMeta</span><span class="p">,</span> <span class="n">abstractmethod</span>
<span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">Counter</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Any</span><span class="p">,</span>
<span class="n">Callable</span><span class="p">,</span>
<span class="n">Dict</span><span class="p">,</span>
<span class="n">Iterable</span><span class="p">,</span>
<span class="n">IO</span><span class="p">,</span>
<span class="n">List</span><span class="p">,</span>
<span class="n">Optional</span><span class="p">,</span>
<span class="n">NoReturn</span><span class="p">,</span>
<span class="n">Tuple</span><span class="p">,</span>
<span class="n">Union</span><span class="p">,</span>
<span class="n">TYPE_CHECKING</span><span class="p">,</span>
<span class="n">cast</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_list_like</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">BooleanType</span><span class="p">,</span>
<span class="n">DoubleType</span><span class="p">,</span>
<span class="n">IntegralType</span><span class="p">,</span>
<span class="n">LongType</span><span class="p">,</span>
<span class="n">NumericType</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Axis</span><span class="p">,</span>
<span class="n">DataFrameOrSeries</span><span class="p">,</span>
<span class="n">Dtype</span><span class="p">,</span>
<span class="n">FrameLike</span><span class="p">,</span>
<span class="n">Label</span><span class="p">,</span>
<span class="n">Name</span><span class="p">,</span>
<span class="n">Scalar</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexing</span> <span class="kn">import</span> <span class="n">AtIndexer</span><span class="p">,</span> <span class="n">iAtIndexer</span><span class="p">,</span> <span class="n">iLocIndexer</span><span class="p">,</span> <span class="n">LocIndexer</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="n">InternalFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">spark_type_to_pandas_dtype</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">is_name_like_tuple</span><span class="p">,</span>
<span class="n">is_name_like_value</span><span class="p">,</span>
<span class="n">name_like_string</span><span class="p">,</span>
<span class="n">scol_for</span><span class="p">,</span>
<span class="n">sql_conf</span><span class="p">,</span>
<span class="n">validate_arguments_and_invoke_function</span><span class="p">,</span>
<span class="n">validate_axis</span><span class="p">,</span>
<span class="n">validate_mode</span><span class="p">,</span>
<span class="n">SPARK_CONF_ARROW_ENABLED</span><span class="p">,</span>
<span class="n">log_advice</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.base</span> <span class="kn">import</span> <span class="n">Index</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">GroupBy</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Rolling</span><span class="p">,</span> <span class="n">Expanding</span>
<span class="n">bool_type</span> <span class="o">=</span> <span class="nb">bool</span>
<span class="k">class</span> <span class="nc">Frame</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">metaclass</span><span class="o">=</span><span class="n">ABCMeta</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The base class for both DataFrame and Series.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_internal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">InternalFrame</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;Series&quot;</span><span class="p">],</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">Column</span><span class="p">]],</span>
<span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;Series&quot;</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span>
<span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">Scalar</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">dtypes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Index&quot;</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_to_internal_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="c1"># TODO: add &#39;axis&#39; parameter</span>
<span class="k">def</span> <span class="nf">cummin</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return cumulative minimum over a DataFrame or Series axis.</span>
<span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative minimum.</span>
<span class="sd"> .. note:: the current implementation of cummin uses Spark&#39;s Window without</span>
<span class="sd"> specifying a partition specification. This moves all data into</span>
<span class="sd"> a single partition on a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method on very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.min : Return the minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummax : Return cumulative maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummin : Return cumulative minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumsum : Return cumulative sum over DataFrame axis.</span>
<span class="sd"> Series.min : Return the minimum over Series axis.</span>
<span class="sd"> Series.cummax : Return cumulative maximum over Series axis.</span>
<span class="sd"> Series.cummin : Return cumulative minimum over Series axis.</span>
<span class="sd"> Series.cumsum : Return cumulative sum over Series axis.</span>
<span class="sd"> Series.cumprod : Return cumulative product over Series axis.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list(&#39;AB&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 1.0 0.0</span>
<span class="sd"> By default, iterates over rows and finds the minimum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.cummin()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 2.0 NaN</span>
<span class="sd"> 2 1.0 0.0</span>
<span class="sd"> It works identically in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.A.cummin()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 1.0</span>
<span class="sd"> Name: A, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># TODO: add &#39;axis&#39; parameter</span>
<span class="k">def</span> <span class="nf">cummax</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return cumulative maximum over a DataFrame or Series axis.</span>
<span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative maximum.</span>
<span class="sd"> .. note:: the current implementation of cummax uses Spark&#39;s Window without</span>
<span class="sd"> specifying a partition specification. This moves all data into</span>
<span class="sd"> a single partition on a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method on very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.max : Return the maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummax : Return cumulative maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummin : Return cumulative minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumsum : Return cumulative sum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumprod : Return cumulative product over DataFrame axis.</span>
<span class="sd"> Series.max : Return the maximum over Series axis.</span>
<span class="sd"> Series.cummax : Return cumulative maximum over Series axis.</span>
<span class="sd"> Series.cummin : Return cumulative minimum over Series axis.</span>
<span class="sd"> Series.cumsum : Return cumulative sum over Series axis.</span>
<span class="sd"> Series.cumprod : Return cumulative product over Series axis.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list(&#39;AB&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 1.0 0.0</span>
<span class="sd"> By default, iterates over rows and finds the maximum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.cummax()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 3.0 1.0</span>
<span class="sd"> It works identically in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.B.cummax()</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 NaN</span>
<span class="sd"> 2 1.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># TODO: add &#39;axis&#39; parameter</span>
<span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return cumulative sum over a DataFrame or Series axis.</span>
<span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative sum.</span>
<span class="sd"> .. note:: the current implementation of cumsum uses Spark&#39;s Window without</span>
<span class="sd"> specifying a partition specification. This moves all data into</span>
<span class="sd"> a single partition on a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method on very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.sum : Return the sum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummax : Return cumulative maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummin : Return cumulative minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumsum : Return cumulative sum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumprod : Return cumulative product over DataFrame axis.</span>
<span class="sd"> Series.sum : Return the sum over Series axis.</span>
<span class="sd"> Series.cummax : Return cumulative maximum over Series axis.</span>
<span class="sd"> Series.cummin : Return cumulative minimum over Series axis.</span>
<span class="sd"> Series.cumsum : Return cumulative sum over Series axis.</span>
<span class="sd"> Series.cumprod : Return cumulative product over Series axis.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list(&#39;AB&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 1.0 0.0</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.cumsum()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 5.0 NaN</span>
<span class="sd"> 2 6.0 1.0</span>
<span class="sd"> It works identically in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.A.cumsum()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 5.0</span>
<span class="sd"> 2 6.0</span>
<span class="sd"> Name: A, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cumsum</span><span class="p">(</span><span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># TODO: add &#39;axis&#39; parameter</span>
<span class="c1"># TODO: use pandas_udf to support negative values and other options later</span>
<span class="c1"># other window except unbounded ones is supported as of Spark 3.0.</span>
<span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return cumulative product over a DataFrame or Series axis.</span>
<span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative product.</span>
<span class="sd"> .. note:: the current implementation of cumprod uses Spark&#39;s Window without</span>
<span class="sd"> specifying a partition specification. This moves all data into</span>
<span class="sd"> a single partition on a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method on very large datasets.</span>
<span class="sd"> .. note:: unlike pandas, pandas-on-Spark emulates the cumulative product with the</span>
<span class="sd"> ``exp(sum(log(...)))`` trick. Therefore, it only works for positive numbers.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.cummax : Return cumulative maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummin : Return cumulative minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumsum : Return cumulative sum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumprod : Return cumulative product over DataFrame axis.</span>
<span class="sd"> Series.cummax : Return cumulative maximum over Series axis.</span>
<span class="sd"> Series.cummin : Return cumulative minimum over Series axis.</span>
<span class="sd"> Series.cumsum : Return cumulative sum over Series axis.</span>
<span class="sd"> Series.cumprod : Return cumulative product over Series axis.</span>
<span class="sd"> Raises</span>
<span class="sd"> ------</span>
<span class="sd"> Exception : If any value is equal to or lower than 0.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[2.0, 1.0], [3.0, None], [4.0, 10.0]], columns=list(&#39;AB&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 4.0 10.0</span>
<span class="sd"> By default, iterates over rows and finds the product in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.cumprod()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 6.0 NaN</span>
<span class="sd"> 2 24.0 10.0</span>
<span class="sd"> It works identically in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.A.cumprod()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 6.0</span>
<span class="sd"> 2 24.0</span>
<span class="sd"> Name: A, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cumprod</span><span class="p">(</span><span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># TODO: Although this has been removed in pandas &gt;= 1.0.0, we&#39;re keeping this as deprecated</span>
<span class="c1"># since we&#39;re using this for `DataFrame.info` internally.</span>
<span class="c1"># We can drop it once our minimal pandas version becomes 1.0.0.</span>
<span class="k">def</span> <span class="nf">get_dtype_counts</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return counts of unique dtypes in this object.</span>
<span class="sd"> .. deprecated:: 0.14.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> dtype : pd.Series</span>
<span class="sd"> Series with the count of columns with each dtype.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> dtypes : Return the dtypes in this object.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; a = [[&#39;a&#39;, 1, 1], [&#39;b&#39;, 2, 2], [&#39;c&#39;, 3, 3]]</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(a, columns=[&#39;str&#39;, &#39;int1&#39;, &#39;int2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> str int1 int2</span>
<span class="sd"> 0 a 1 1</span>
<span class="sd"> 1 b 2 2</span>
<span class="sd"> 2 c 3 3</span>
<span class="sd"> &gt;&gt;&gt; df.get_dtype_counts().sort_values()</span>
<span class="sd"> object 1</span>
<span class="sd"> int64 2</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.str.get_dtype_counts().sort_values()</span>
<span class="sd"> object 1</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;`get_dtype_counts` has been deprecated and will be &quot;</span>
<span class="s2">&quot;removed in a future version. For DataFrames use &quot;</span>
<span class="s2">&quot;`.dtypes.value_counts()&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtypes</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">):</span>
<span class="n">dtypes</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">dtypes</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">dtypes</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtypes</span><span class="p">)</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="n">Counter</span><span class="p">([</span><span class="n">d</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">d</span> <span class="ow">in</span> <span class="n">dtypes</span><span class="p">])))</span>
<span class="k">def</span> <span class="nf">pipe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Apply func(self, \*args, \*\*kwargs).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : function</span>
<span class="sd"> function to apply to the DataFrame.</span>
<span class="sd"> ``args``, and ``kwargs`` are passed into ``func``.</span>
<span class="sd"> Alternatively a ``(callable, data_keyword)`` tuple where</span>
<span class="sd"> ``data_keyword`` is a string indicating the keyword of</span>
<span class="sd"> ``callable`` that expects the DataFrames.</span>
<span class="sd"> args : iterable, optional</span>
<span class="sd"> positional arguments passed into ``func``.</span>
<span class="sd"> kwargs : mapping, optional</span>
<span class="sd"> a dictionary of keyword arguments passed into ``func``.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> object : the return type of ``func``.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Use ``.pipe`` when chaining together functions that expect</span>
<span class="sd"> Series, DataFrames or GroupBy objects. For example, given</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;category&#39;: [&#39;A&#39;, &#39;A&#39;, &#39;B&#39;],</span>
<span class="sd"> ... &#39;col1&#39;: [1, 2, 3],</span>
<span class="sd"> ... &#39;col2&#39;: [4, 5, 6]},</span>
<span class="sd"> ... columns=[&#39;category&#39;, &#39;col1&#39;, &#39;col2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; def keep_category_a(df):</span>
<span class="sd"> ... return df[df[&#39;category&#39;] == &#39;A&#39;]</span>
<span class="sd"> &gt;&gt;&gt; def add_one(df, column):</span>
<span class="sd"> ... return df.assign(col3=df[column] + 1)</span>
<span class="sd"> &gt;&gt;&gt; def multiply(df, column1, column2):</span>
<span class="sd"> ... return df.assign(col4=df[column1] * df[column2])</span>
<span class="sd"> instead of writing</span>
<span class="sd"> &gt;&gt;&gt; multiply(add_one(keep_category_a(df), column=&quot;col1&quot;), column1=&quot;col2&quot;, column2=&quot;col3&quot;)</span>
<span class="sd"> category col1 col2 col3 col4</span>
<span class="sd"> 0 A 1 4 2 8</span>
<span class="sd"> 1 A 2 5 3 15</span>
<span class="sd"> You can write</span>
<span class="sd"> &gt;&gt;&gt; (df.pipe(keep_category_a)</span>
<span class="sd"> ... .pipe(add_one, column=&quot;col1&quot;)</span>
<span class="sd"> ... .pipe(multiply, column1=&quot;col2&quot;, column2=&quot;col3&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> category col1 col2 col3 col4</span>
<span class="sd"> 0 A 1 4 2 8</span>
<span class="sd"> 1 A 2 5 3 15</span>
<span class="sd"> If you have a function that takes the data as (say) the second</span>
<span class="sd"> argument, pass a tuple indicating which keyword expects the</span>
<span class="sd"> data. For example, suppose ``multiply_2`` takes its data as ``df``:</span>
<span class="sd"> &gt;&gt;&gt; def multiply_2(column1, df, column2):</span>
<span class="sd"> ... return df.assign(col4=df[column1] * df[column2])</span>
<span class="sd"> Then you can write</span>
<span class="sd"> &gt;&gt;&gt; (df.pipe(keep_category_a)</span>
<span class="sd"> ... .pipe(add_one, column=&quot;col1&quot;)</span>
<span class="sd"> ... .pipe((multiply_2, &#39;df&#39;), column1=&quot;col2&quot;, column2=&quot;col3&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> category col1 col2 col3 col4</span>
<span class="sd"> 0 A 1 4 2 8</span>
<span class="sd"> 1 A 2 5 3 15</span>
<span class="sd"> You can use lambda as well</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).pipe(lambda x: (x + 1).rename(&quot;value&quot;))</span>
<span class="sd"> 0 2</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 4</span>
<span class="sd"> Name: value, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="n">func</span><span class="p">,</span> <span class="n">target</span> <span class="o">=</span> <span class="n">func</span>
<span class="k">if</span> <span class="n">target</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> is both the pipe target and a keyword &quot;</span> <span class="s2">&quot;argument&quot;</span> <span class="o">%</span> <span class="n">target</span><span class="p">)</span>
<span class="n">kwargs</span><span class="p">[</span><span class="n">target</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">to_numpy</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A NumPy ndarray representing the values in this DataFrame or Series.</span>
<span class="sd"> .. note:: This method should only be used if the resulting NumPy ndarray is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> numpy.ndarray</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&quot;A&quot;: [1, 2], &quot;B&quot;: [3, 4]}).to_numpy()</span>
<span class="sd"> array([[1, 3],</span>
<span class="sd"> [2, 4]])</span>
<span class="sd"> With heterogeneous data, the lowest common type will have to be used.</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&quot;A&quot;: [1, 2], &quot;B&quot;: [3.0, 4.5]}).to_numpy()</span>
<span class="sd"> array([[1. , 3. ],</span>
<span class="sd"> [2. , 4.5]])</span>
<span class="sd"> For a mix of numeric and non-numeric types, the output array will have object dtype.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2], &quot;B&quot;: [3.0, 4.5], &quot;C&quot;: pd.date_range(&#39;2000&#39;, periods=2)})</span>
<span class="sd"> &gt;&gt;&gt; df.to_numpy()</span>
<span class="sd"> array([[1, 3.0, Timestamp(&#39;2000-01-01 00:00:00&#39;)],</span>
<span class="sd"> [2, 4.5, Timestamp(&#39;2000-01-02 00:00:00&#39;)]], dtype=object)</span>
<span class="sd"> For Series,</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([&#39;a&#39;, &#39;b&#39;, &#39;a&#39;]).to_numpy()</span>
<span class="sd"> array([&#39;a&#39;, &#39;b&#39;, &#39;a&#39;], dtype=object)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;`to_numpy` loads all data into the driver&#39;s memory. &quot;</span>
<span class="s2">&quot;It should only be used if the resulting NumPy ndarray is expected to be small.&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">values</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">values</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a NumPy representation of the DataFrame or the Series.</span>
<span class="sd"> .. warning:: We recommend using `DataFrame.to_numpy()` or `Series.to_numpy()` instead.</span>
<span class="sd"> .. note:: This method should only be used if the resulting NumPy ndarray is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> numpy.ndarray</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> A DataFrame where all columns are the same type (e.g., int64) results in an array of</span>
<span class="sd"> the same type.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;age&#39;: [ 3, 29],</span>
<span class="sd"> ... &#39;height&#39;: [94, 170],</span>
<span class="sd"> ... &#39;weight&#39;: [31, 115]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> age height weight</span>
<span class="sd"> 0 3 94 31</span>
<span class="sd"> 1 29 170 115</span>
<span class="sd"> &gt;&gt;&gt; df.dtypes</span>
<span class="sd"> age int64</span>
<span class="sd"> height int64</span>
<span class="sd"> weight int64</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; df.values</span>
<span class="sd"> array([[ 3, 94, 31],</span>
<span class="sd"> [ 29, 170, 115]])</span>
<span class="sd"> A DataFrame with mixed type columns (e.g., str/object, int64, float32) results in an ndarray</span>
<span class="sd"> of the broadest type that accommodates these mixed types (e.g., object).</span>
<span class="sd"> &gt;&gt;&gt; df2 = ps.DataFrame([(&#39;parrot&#39;, 24.0, &#39;second&#39;),</span>
<span class="sd"> ... (&#39;lion&#39;, 80.5, &#39;first&#39;),</span>
<span class="sd"> ... (&#39;monkey&#39;, np.nan, None)],</span>
<span class="sd"> ... columns=(&#39;name&#39;, &#39;max_speed&#39;, &#39;rank&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df2.dtypes</span>
<span class="sd"> name object</span>
<span class="sd"> max_speed float64</span>
<span class="sd"> rank object</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; df2.values</span>
<span class="sd"> array([[&#39;parrot&#39;, 24.0, &#39;second&#39;],</span>
<span class="sd"> [&#39;lion&#39;, 80.5, &#39;first&#39;],</span>
<span class="sd"> [&#39;monkey&#39;, nan, None]], dtype=object)</span>
<span class="sd"> For Series,</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).values</span>
<span class="sd"> array([1, 2, 3])</span>
<span class="sd"> &gt;&gt;&gt; ps.Series(list(&#39;aabc&#39;)).values</span>
<span class="sd"> array([&#39;a&#39;, &#39;a&#39;, &#39;b&#39;, &#39;c&#39;], dtype=object)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;We recommend using `</span><span class="si">{}</span><span class="s2">.to_numpy()` instead.&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_numpy</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">to_csv</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;,&quot;</span><span class="p">,</span>
<span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">quotechar</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">&#39;&quot;&#39;</span><span class="p">,</span>
<span class="n">date_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">escapechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">num_files</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;w&quot;</span><span class="p">,</span>
<span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write object to a comma-separated values (csv) file.</span>
<span class="sd"> .. note:: pandas-on-Spark `to_csv` writes files to a path or URI. Unlike pandas,</span>
<span class="sd"> pandas-on-Spark respects HDFS properties such as &#39;fs.default.name&#39;.</span>
<span class="sd"> .. note:: pandas-on-Spark writes CSV files into the directory, `path`, and writes</span>
<span class="sd"> multiple `part-...` files in the directory when `path` is specified.</span>
<span class="sd"> This behaviour was inherited from Apache Spark. The number of files can</span>
<span class="sd"> be controlled by `num_files`.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : str, default None</span>
<span class="sd"> File path. If None is provided the result is returned as a string.</span>
<span class="sd"> sep : str, default &#39;,&#39;</span>
<span class="sd"> String of length 1. Field delimiter for the output file.</span>
<span class="sd"> na_rep : str, default &#39;&#39;</span>
<span class="sd"> Missing data representation.</span>
<span class="sd"> columns : sequence, optional</span>
<span class="sd"> Columns to write.</span>
<span class="sd"> header : bool or list of str, default True</span>
<span class="sd"> Write out the column names. If a list of strings is given it is</span>
<span class="sd"> assumed to be aliases for the column names.</span>
<span class="sd"> quotechar : str, default &#39;\&quot;&#39;</span>
<span class="sd"> String of length 1. Character used to quote fields.</span>
<span class="sd"> date_format : str, default None</span>
<span class="sd"> Format string for datetime objects.</span>
<span class="sd"> escapechar : str, default None</span>
<span class="sd"> String of length 1. Character used to escape `sep` and `quotechar`</span>
<span class="sd"> when appropriate.</span>
<span class="sd"> num_files : the number of files to be written in `path` directory when</span>
<span class="sd"> this is a path.</span>
<span class="sd"> mode : str</span>
<span class="sd"> Python write mode, default &#39;w&#39;.</span>
<span class="sd"> .. note:: mode can accept the strings for Spark writing mode,</span>
<span class="sd"> such as &#39;append&#39;, &#39;overwrite&#39;, &#39;ignore&#39;, &#39;error&#39;, &#39;errorifexists&#39;.</span>
<span class="sd"> - &#39;append&#39; (equivalent to &#39;a&#39;): Append the new data to existing data.</span>
<span class="sd"> - &#39;overwrite&#39; (equivalent to &#39;w&#39;): Overwrite existing data.</span>
<span class="sd"> - &#39;ignore&#39;: Silently ignore this operation if data already exists.</span>
<span class="sd"> - &#39;error&#39; or &#39;errorifexists&#39;: Throw an exception if data already exists.</span>
<span class="sd"> partition_cols : str or list of str, optional, default None</span>
<span class="sd"> Names of partitioning columns</span>
<span class="sd"> index_col: str or list of str, optional, default: None</span>
<span class="sd"> Column names to be used in Spark to represent pandas-on-Spark&#39;s index. The index name</span>
<span class="sd"> in pandas-on-Spark is ignored. By default, the index is always lost.</span>
<span class="sd"> options: keyword arguments for additional options specific to PySpark.</span>
<span class="sd"> These kwargs are specific to PySpark&#39;s CSV options to pass. Check</span>
<span class="sd"> the options in PySpark&#39;s API documentation for spark.write.csv(...).</span>
<span class="sd"> It has higher priority and overwrites all other options.</span>
<span class="sd"> This parameter only works when `path` is specified.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> str or None</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_csv</span>
<span class="sd"> DataFrame.to_delta</span>
<span class="sd"> DataFrame.to_table</span>
<span class="sd"> DataFrame.to_parquet</span>
<span class="sd"> DataFrame.to_spark_io</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(dict(</span>
<span class="sd"> ... date=list(pd.date_range(&#39;2012-1-1 12:00:00&#39;, periods=3, freq=&#39;M&#39;)),</span>
<span class="sd"> ... country=[&#39;KR&#39;, &#39;US&#39;, &#39;JP&#39;],</span>
<span class="sd"> ... code=[1, 2 ,3]), columns=[&#39;date&#39;, &#39;country&#39;, &#39;code&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.sort_values(by=&quot;date&quot;) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> date country code</span>
<span class="sd"> ... 2012-01-31 12:00:00 KR 1</span>
<span class="sd"> ... 2012-02-29 12:00:00 US 2</span>
<span class="sd"> ... 2012-03-31 12:00:00 JP 3</span>
<span class="sd"> &gt;&gt;&gt; print(df.to_csv()) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> date,country,code</span>
<span class="sd"> 2012-01-31 12:00:00,KR,1</span>
<span class="sd"> 2012-02-29 12:00:00,US,2</span>
<span class="sd"> 2012-03-31 12:00:00,JP,3</span>
<span class="sd"> &gt;&gt;&gt; df.cummax().to_csv(path=r&#39;%s/to_csv/foo.csv&#39; % path, num_files=1)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv(</span>
<span class="sd"> ... path=r&#39;%s/to_csv/foo.csv&#39; % path</span>
<span class="sd"> ... ).sort_values(by=&quot;date&quot;) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> date country code</span>
<span class="sd"> ... 2012-01-31 12:00:00 KR 1</span>
<span class="sd"> ... 2012-02-29 12:00:00 US 2</span>
<span class="sd"> ... 2012-03-31 12:00:00 US 3</span>
<span class="sd"> In case of Series,</span>
<span class="sd"> &gt;&gt;&gt; print(df.date.to_csv()) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> date</span>
<span class="sd"> 2012-01-31 12:00:00</span>
<span class="sd"> 2012-02-29 12:00:00</span>
<span class="sd"> 2012-03-31 12:00:00</span>
<span class="sd"> &gt;&gt;&gt; df.date.to_csv(path=r&#39;%s/to_csv/foo.csv&#39; % path, num_files=1)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv(</span>
<span class="sd"> ... path=r&#39;%s/to_csv/foo.csv&#39; % path</span>
<span class="sd"> ... ).sort_values(by=&quot;date&quot;) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> date</span>
<span class="sd"> ... 2012-01-31 12:00:00</span>
<span class="sd"> ... 2012-02-29 12:00:00</span>
<span class="sd"> ... 2012-03-31 12:00:00</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; df.set_index(&quot;country&quot;, append=True, inplace=True)</span>
<span class="sd"> &gt;&gt;&gt; df.date.to_csv(</span>
<span class="sd"> ... path=r&#39;%s/to_csv/bar.csv&#39; % path,</span>
<span class="sd"> ... num_files=1,</span>
<span class="sd"> ... index_col=[&quot;index1&quot;, &quot;index2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv(</span>
<span class="sd"> ... path=r&#39;%s/to_csv/bar.csv&#39; % path, index_col=[&quot;index1&quot;, &quot;index2&quot;]</span>
<span class="sd"> ... ).sort_values(by=&quot;date&quot;) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> date</span>
<span class="sd"> index1 index2</span>
<span class="sd"> ... ... 2012-01-31 12:00:00</span>
<span class="sd"> ... ... 2012-02-29 12:00:00</span>
<span class="sd"> ... ... 2012-03-31 12:00:00</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># If path is none, just collect and use pandas&#39;s to_csv.</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span>
<span class="kc">None</span><span class="p">,</span>
<span class="n">sep</span><span class="o">=</span><span class="n">sep</span><span class="p">,</span>
<span class="n">na_rep</span><span class="o">=</span><span class="n">na_rep</span><span class="p">,</span>
<span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span>
<span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span>
<span class="n">quotechar</span><span class="o">=</span><span class="n">quotechar</span><span class="p">,</span>
<span class="n">date_format</span><span class="o">=</span><span class="n">date_format</span><span class="p">,</span>
<span class="n">escapechar</span><span class="o">=</span><span class="n">escapechar</span><span class="p">,</span>
<span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="n">label</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">label</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">col</span><span class="p">,))</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">index_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">index_cols</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_cols</span> <span class="o">=</span> <span class="n">index_col</span>
<span class="k">if</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;to_csv only support one-level index column now&quot;</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">header</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="n">new_name</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">new_name</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">header</span><span class="p">))</span>
<span class="p">]</span>
<span class="p">)</span>
<span class="n">header</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">num_files</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;`num_files` has been deprecated and might be removed in a future version. &quot;</span>
<span class="s2">&quot;Use `DataFrame.spark.repartition` instead.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_files</span><span class="p">)</span>
<span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span>
<span class="n">builder</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span>
<span class="k">if</span> <span class="n">partition_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">builder</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">partition_cols</span><span class="p">)</span>
<span class="n">builder</span><span class="o">.</span><span class="n">_set_opts</span><span class="p">(</span>
<span class="n">sep</span><span class="o">=</span><span class="n">sep</span><span class="p">,</span>
<span class="n">nullValue</span><span class="o">=</span><span class="n">na_rep</span><span class="p">,</span>
<span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span>
<span class="n">quote</span><span class="o">=</span><span class="n">quotechar</span><span class="p">,</span>
<span class="n">dateFormat</span><span class="o">=</span><span class="n">date_format</span><span class="p">,</span>
<span class="n">charToEscapeQuoteEscaping</span><span class="o">=</span><span class="n">escapechar</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">builder</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;csv&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">to_json</span><span class="p">(</span>
    <span class="bp">self</span><span class="p">,</span>
    <span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
    <span class="n">compression</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;uncompressed&quot;</span><span class="p">,</span>
    <span class="n">num_files</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
    <span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;w&quot;</span><span class="p">,</span>
    <span class="n">orient</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;records&quot;</span><span class="p">,</span>
    <span class="n">lines</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
    <span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
    <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
    <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd">    Convert the object to a JSON string.</span>

<span class="sd">    .. note:: pandas-on-Spark `to_json` writes files to a path or URI. Unlike pandas&#39;,</span>
<span class="sd">        pandas-on-Spark respects HDFS&#39;s property such as &#39;fs.default.name&#39;.</span>

<span class="sd">    .. note:: pandas-on-Spark writes JSON files into the directory, `path`, and writes</span>
<span class="sd">        multiple `part-...` files in the directory when `path` is specified.</span>
<span class="sd">        This behaviour was inherited from Apache Spark. The number of files can</span>
<span class="sd">        be controlled by `num_files`.</span>

<span class="sd">    .. note:: output JSON format is different from pandas&#39;. It always uses `orient=&#39;records&#39;`</span>
<span class="sd">        for its output. This behaviour might have to change in the near future.</span>

<span class="sd">    .. note:: Set `ignoreNullFields` keyword argument to `True` to omit `None` or `NaN` values</span>
<span class="sd">        when writing JSON objects. It works only when `path` is provided.</span>

<span class="sd">    Note NaN&#39;s and None will be converted to null and datetime objects</span>
<span class="sd">    will be converted to UNIX timestamps.</span>

<span class="sd">    Parameters</span>
<span class="sd">    ----------</span>
<span class="sd">    path : string, optional</span>
<span class="sd">        File path. If not specified, the result is returned as</span>
<span class="sd">        a string.</span>
<span class="sd">    lines : bool, default True</span>
<span class="sd">        If ‘orient’ is ‘records’ write out line delimited json format.</span>
<span class="sd">        Will throw ValueError if incorrect ‘orient’ since others are not</span>
<span class="sd">        list like. It should be always True for now.</span>
<span class="sd">    orient : str, default &#39;records&#39;</span>
<span class="sd">        It should be always &#39;records&#39; for now.</span>
<span class="sd">    compression : str, default &#39;uncompressed&#39;</span>
<span class="sd">        Name of the compression codec for the output files. The value is handed to</span>
<span class="sd">        the Spark writer as its `compression` option, so it is only used when `path`</span>
<span class="sd">        is specified. Check PySpark&#39;s JSON options for the accepted codec names.</span>
<span class="sd">    num_files : int, optional</span>
<span class="sd">        The number of files to be written in `path` directory when this is a path.</span>
<span class="sd">        Deprecated: passing it emits a `FutureWarning`; use</span>
<span class="sd">        `DataFrame.spark.repartition` instead.</span>
<span class="sd">    mode : str</span>
<span class="sd">        Python write mode, default &#39;w&#39;.</span>

<span class="sd">        .. note:: mode can accept the strings for Spark writing mode.</span>
<span class="sd">            Such as &#39;append&#39;, &#39;overwrite&#39;, &#39;ignore&#39;, &#39;error&#39;, &#39;errorifexists&#39;.</span>

<span class="sd">            - &#39;append&#39; (equivalent to &#39;a&#39;): Append the new data to existing data.</span>
<span class="sd">            - &#39;overwrite&#39; (equivalent to &#39;w&#39;): Overwrite existing data.</span>
<span class="sd">            - &#39;ignore&#39;: Silently ignore this operation if data already exists.</span>
<span class="sd">            - &#39;error&#39; or &#39;errorifexists&#39;: Throw an exception if data already exists.</span>

<span class="sd">    partition_cols : str or list of str, optional, default None</span>
<span class="sd">        Names of partitioning columns</span>
<span class="sd">    index_col: str or list of str, optional, default: None</span>
<span class="sd">        Column names to be used in Spark to represent pandas-on-Spark&#39;s index. The index name</span>
<span class="sd">        in pandas-on-Spark is ignored. By default, the index is always lost.</span>
<span class="sd">    options: keyword arguments for additional options specific to PySpark.</span>
<span class="sd">        It is specific to PySpark&#39;s JSON options to pass. Check</span>
<span class="sd">        the options in PySpark&#39;s API documentation for `spark.write.json(...)`.</span>
<span class="sd">        It has a higher priority and overwrites all other options.</span>
<span class="sd">        This parameter only works when `path` is specified.</span>

<span class="sd">    Returns</span>
<span class="sd">    -------</span>
<span class="sd">    str or None</span>
<span class="sd">        The JSON string when `path` is not specified, otherwise None.</span>

<span class="sd">    Raises</span>
<span class="sd">    ------</span>
<span class="sd">    NotImplementedError</span>
<span class="sd">        If `lines` is False or `orient` is not &#39;records&#39;.</span>

<span class="sd">    Examples</span>
<span class="sd">    --------</span>
<span class="sd">    &gt;&gt;&gt; df = ps.DataFrame([[&#39;a&#39;, &#39;b&#39;], [&#39;c&#39;, &#39;d&#39;]],</span>
<span class="sd">    ...                   columns=[&#39;col 1&#39;, &#39;col 2&#39;])</span>
<span class="sd">    &gt;&gt;&gt; df.to_json()</span>
<span class="sd">    &#39;[{&quot;col 1&quot;:&quot;a&quot;,&quot;col 2&quot;:&quot;b&quot;},{&quot;col 1&quot;:&quot;c&quot;,&quot;col 2&quot;:&quot;d&quot;}]&#39;</span>

<span class="sd">    &gt;&gt;&gt; df[&#39;col 1&#39;].to_json()</span>
<span class="sd">    &#39;[{&quot;col 1&quot;:&quot;a&quot;},{&quot;col 1&quot;:&quot;c&quot;}]&#39;</span>

<span class="sd">    &gt;&gt;&gt; df.to_json(path=r&#39;%s/to_json/foo.json&#39; % path, num_files=1)</span>
<span class="sd">    &gt;&gt;&gt; ps.read_json(</span>
<span class="sd">    ...     path=r&#39;%s/to_json/foo.json&#39; % path</span>
<span class="sd">    ... ).sort_values(by=&quot;col 1&quot;)</span>
<span class="sd">      col 1 col 2</span>
<span class="sd">    0     a     b</span>
<span class="sd">    1     c     d</span>

<span class="sd">    &gt;&gt;&gt; df[&#39;col 1&#39;].to_json(path=r&#39;%s/to_json/foo.json&#39; % path, num_files=1, index_col=&quot;index&quot;)</span>
<span class="sd">    &gt;&gt;&gt; ps.read_json(</span>
<span class="sd">    ...     path=r&#39;%s/to_json/foo.json&#39; % path, index_col=&quot;index&quot;</span>
<span class="sd">    ... ).sort_values(by=&quot;col 1&quot;)  # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd">          col 1</span>
<span class="sd">    index</span>
<span class="sd">    0         a</span>
<span class="sd">    1         c</span>
<span class="sd">    &quot;&quot;&quot;</span>
    <span class="c1"># Accept `options={...}` passed as a single dict and unwrap it into the keyword options.</span>
    <span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
        <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>

    <span class="c1"># Caller-supplied options take precedence over the defaults.</span>
    <span class="n">default_options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;ignoreNullFields&quot;</span><span class="p">:</span> <span class="kc">False</span><span class="p">}</span>
    <span class="n">options</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">default_options</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">}</span>

    <span class="k">if</span> <span class="ow">not</span> <span class="n">lines</span><span class="p">:</span>
        <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;lines=False is not implemented yet.&quot;</span><span class="p">)</span>

    <span class="k">if</span> <span class="n">orient</span> <span class="o">!=</span> <span class="s2">&quot;records&quot;</span><span class="p">:</span>
        <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;orient=&#39;records&#39; is supported only for now.&quot;</span><span class="p">)</span>

    <span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
        <span class="c1"># If path is none, just collect and use pandas&#39;s to_json.</span>
        <span class="n">pdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span>
        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
            <span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
        <span class="c1"># To make the format consistent and readable by `read_json`, convert it to pandas&#39; and</span>
        <span class="c1"># use &#39;records&#39; orient for now.</span>
        <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">&quot;records&quot;</span><span class="p">)</span>

    <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
        <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span>
    <span class="k">else</span><span class="p">:</span>
        <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span>
        <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
    <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">)</span>

    <span class="k">if</span> <span class="n">num_files</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
        <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
            <span class="s2">&quot;`num_files` has been deprecated and might be removed in a future version. &quot;</span>
            <span class="s2">&quot;Use `DataFrame.spark.repartition` instead.&quot;</span><span class="p">,</span>
            <span class="ne">FutureWarning</span><span class="p">,</span>
        <span class="p">)</span>
        <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_files</span><span class="p">)</span>

    <span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span>
    <span class="n">builder</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span>
    <span class="k">if</span> <span class="n">partition_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
        <span class="c1"># Keep the returned writer instead of relying on in-place mutation.</span>
        <span class="n">builder</span> <span class="o">=</span> <span class="n">builder</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">partition_cols</span><span class="p">)</span>
    <span class="n">builder</span><span class="o">.</span><span class="n">_set_opts</span><span class="p">(</span><span class="n">compression</span><span class="o">=</span><span class="n">compression</span><span class="p">)</span>
    <span class="c1"># `options` is applied after `_set_opts`, so it wins over `compression` on conflict.</span>
    <span class="n">builder</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;json&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
    <span class="k">return</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">to_excel</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">excel_writer</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">],</span>
<span class="n">sheet_name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;Sheet1&quot;</span><span class="p">,</span>
<span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span><span class="p">,</span>
<span class="n">float_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">index_label</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">startrow</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">startcol</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">engine</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">merge_cells</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inf_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;inf&quot;</span><span class="p">,</span>
<span class="n">verbose</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">freeze_panes</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write object to an Excel sheet.</span>
<span class="sd"> .. note:: This method should only be used if the resulting DataFrame is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> To write a single object to an Excel .xlsx file it is only necessary to</span>
<span class="sd"> specify a target file name. To write to multiple sheets it is necessary to</span>
<span class="sd"> create an `ExcelWriter` object with a target file name, and specify a sheet</span>
<span class="sd"> in the file to write to.</span>
<span class="sd"> Multiple sheets may be written to by specifying unique `sheet_name`.</span>
<span class="sd"> With all data written to the file it is necessary to save the changes.</span>
<span class="sd"> Note that creating an `ExcelWriter` object with a file name that already</span>
<span class="sd"> exists will result in the contents of the existing file being erased.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> excel_writer : str or ExcelWriter object</span>
<span class="sd"> File path or existing ExcelWriter.</span>
<span class="sd"> sheet_name : str, default &#39;Sheet1&#39;</span>
<span class="sd"> Name of sheet which will contain DataFrame.</span>
<span class="sd"> na_rep : str, default &#39;&#39;</span>
<span class="sd"> Missing data representation.</span>
<span class="sd"> float_format : str, optional</span>
<span class="sd"> Format string for floating point numbers. For example</span>
<span class="sd"> ``float_format=&quot;%%.2f&quot;`` will format 0.1234 to 0.12.</span>
<span class="sd"> columns : sequence or list of str, optional</span>
<span class="sd"> Columns to write.</span>
<span class="sd"> header : bool or list of str, default True</span>
<span class="sd"> Write out the column names. If a list of string is given it is</span>
<span class="sd"> assumed to be aliases for the column names.</span>
<span class="sd"> index : bool, default True</span>
<span class="sd"> Write row names (index).</span>
<span class="sd"> index_label : str or sequence, optional</span>
<span class="sd"> Column label for index column(s) if desired. If not specified, and</span>
<span class="sd"> `header` and `index` are True, then the index names are used. A</span>
<span class="sd"> sequence should be given if the DataFrame uses MultiIndex.</span>
<span class="sd"> startrow : int, default 0</span>
<span class="sd"> Upper left cell row to dump data frame.</span>
<span class="sd"> startcol : int, default 0</span>
<span class="sd"> Upper left cell column to dump data frame.</span>
<span class="sd"> engine : str, optional</span>
<span class="sd"> Write engine to use, &#39;openpyxl&#39; or &#39;xlsxwriter&#39;. You can also set this</span>
<span class="sd"> via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and</span>
<span class="sd"> ``io.excel.xlsm.writer``.</span>
<span class="sd"> merge_cells : bool, default True</span>
<span class="sd"> Write MultiIndex and Hierarchical Rows as merged cells.</span>
<span class="sd"> encoding : str, optional</span>
<span class="sd"> Encoding of the resulting excel file. Only necessary for xlwt,</span>
<span class="sd"> other writers support unicode natively.</span>
<span class="sd"> inf_rep : str, default &#39;inf&#39;</span>
<span class="sd"> Representation for infinity (there is no native representation for</span>
<span class="sd"> infinity in Excel).</span>
<span class="sd"> verbose : bool, default True</span>
<span class="sd"> Display more information in the error logs.</span>
<span class="sd"> freeze_panes : tuple of int (length 2), optional</span>
<span class="sd"> Specifies the one-based bottommost row and rightmost column that</span>
<span class="sd"> is to be frozen.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Once a workbook has been saved it is not possible to write further data</span>
<span class="sd"> without rewriting the whole workbook.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_excel : Read Excel file.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Create, write to and save a workbook:</span>
<span class="sd"> &gt;&gt;&gt; df1 = ps.DataFrame([[&#39;a&#39;, &#39;b&#39;], [&#39;c&#39;, &#39;d&#39;]],</span>
<span class="sd"> ... index=[&#39;row 1&#39;, &#39;row 2&#39;],</span>
<span class="sd"> ... columns=[&#39;col 1&#39;, &#39;col 2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df1.to_excel(&quot;output.xlsx&quot;) # doctest: +SKIP</span>
<span class="sd"> To specify the sheet name:</span>
<span class="sd"> &gt;&gt;&gt; df1.to_excel(&quot;output.xlsx&quot;) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; df1.to_excel(&quot;output.xlsx&quot;,</span>
<span class="sd"> ... sheet_name=&#39;Sheet_name_1&#39;) # doctest: +SKIP</span>
<span class="sd"> If you wish to write to more than one sheet in the workbook, it is</span>
<span class="sd"> necessary to specify an ExcelWriter object:</span>
<span class="sd"> &gt;&gt;&gt; with pd.ExcelWriter(&#39;output.xlsx&#39;) as writer: # doctest: +SKIP</span>
<span class="sd"> ... df1.to_excel(writer, sheet_name=&#39;Sheet_name_1&#39;)</span>
<span class="sd"> ... df2.to_excel(writer, sheet_name=&#39;Sheet_name_2&#39;)</span>
<span class="sd"> To set the library that is used to write the Excel file,</span>
<span class="sd"> you can pass the `engine` keyword (the default engine is</span>
<span class="sd"> automatically chosen depending on the file extension):</span>
<span class="sd"> &gt;&gt;&gt; df1.to_excel(&#39;output1.xlsx&#39;, engine=&#39;xlsxwriter&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;`to_excel` loads all data into the driver&#39;s memory. &quot;</span>
<span class="s2">&quot;It should only be used if the resulting DataFrame is expected to be small.&quot;</span>
<span class="p">)</span>
<span class="c1"># Make sure locals() call is at the top of the function so we don&#39;t capture local variables.</span>
<span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_excel</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="o">.</span><span class="n">to_excel</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Constructor expects DataFrame or Series; however, &quot;</span> <span class="s2">&quot;got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="p">,)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_excel</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">args</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the mean of the values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> mean : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.mean()</span>
<span class="sd"> a 2.0</span>
<span class="sd"> b 0.2</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.mean(axis=1)</span>
<span class="sd"> 0 0.55</span>
<span class="sd"> 1 1.10</span>
<span class="sd"> 2 1.65</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].mean()</span>
<span class="sd"> 2.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">mean</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">sum</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the sum of the values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> min_count : int, default 0</span>
<span class="sd"> The required number of valid values to perform the operation. If fewer than</span>
<span class="sd"> ``min_count`` non-NA values are present the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> sum : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, np.nan, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.sum()</span>
<span class="sd"> a 6.0</span>
<span class="sd"> b 0.4</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.sum(axis=1)</span>
<span class="sd"> 0 1.1</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 3.3</span>
<span class="sd"> 3 0.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.sum(min_count=3)</span>
<span class="sd"> a 6.0</span>
<span class="sd"> b NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.sum(axis=1, min_count=1)</span>
<span class="sd"> 0 1.1</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 3.3</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].sum()</span>
<span class="sd"> 6.0</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].sum(min_count=3)</span>
<span class="sd"> 6.0</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;b&#39;].sum(min_count=3)</span>
<span class="sd"> nan</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">spark_column</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="nb">sum</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;sum&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">product</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the product of the values.</span>
<span class="sd"> .. note:: unlike pandas, pandas-on-Spark emulates product with the ``exp(sum(log(...)))``</span>
<span class="sd"> trick. Therefore, it only works for positive numbers.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> min_count : int, default 0</span>
<span class="sd"> The required number of valid values to perform the operation. If fewer than</span>
<span class="sd"> ``min_count`` non-NA values are present the result will be NA.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> Non-numeric type columns are not included in the result.</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&#39;A&#39;: [1, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;B&#39;: [10, 20, 30, 40, 50],</span>
<span class="sd"> ... &#39;C&#39;: [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 10 a</span>
<span class="sd"> 1 2 20 b</span>
<span class="sd"> 2 3 30 c</span>
<span class="sd"> 3 4 40 d</span>
<span class="sd"> 4 5 50 e</span>
<span class="sd"> &gt;&gt;&gt; psdf.prod()</span>
<span class="sd"> A 120</span>
<span class="sd"> B 12000000</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> If there are no numeric type columns, returns an empty Series.</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&quot;key&quot;: [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;], &quot;val&quot;: [&#39;x&#39;, &#39;y&#39;, &#39;z&#39;]}).prod()</span>
<span class="sd"> Series([], dtype: float64)</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3, 4, 5]).prod()</span>
<span class="sd"> 120</span>
<span class="sd"> By default, the product of an empty or all-NA Series is ``1``</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([]).prod()</span>
<span class="sd"> 1.0</span>
<span class="sd"> This can be controlled with the ``min_count`` parameter</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([]).prod(min_count=1)</span>
<span class="sd"> nan</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">prod</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">spark_column</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="n">num_zeros</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">spark_column</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="n">sign</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">spark_column</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span>
<span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">num_zeros</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span>
<span class="n">sign</span> <span class="o">*</span> <span class="n">F</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">spark_column</span><span class="p">))))</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">IntegralType</span><span class="p">):</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">round</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">prod</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;prod&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span>
<span class="p">)</span>
<span class="n">prod</span> <span class="o">=</span> <span class="n">product</span>
<span class="k">def</span> <span class="nf">skew</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unbiased skew normalized by N-1.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> skew : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.skew() # doctest: +SKIP</span>
<span class="sd"> a 0.000000e+00</span>
<span class="sd"> b -3.319678e-16</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].skew()</span>
<span class="sd"> 0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">skew</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">count_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="o">~</span><span class="n">spark_column</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span>
<span class="c1"># refer to the Pandas implementation &#39;nanskew&#39;</span>
<span class="c1"># https://github.com/pandas-dev/pandas/blob/main/pandas/core/nanops.py#L1152</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">count_scol</span> <span class="o">&gt;</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">F</span><span class="o">.</span><span class="n">skewness</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="o">*</span> <span class="n">F</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="mi">1</span> <span class="o">/</span> <span class="n">count_scol</span><span class="p">)</span>
<span class="o">*</span> <span class="p">(</span><span class="n">count_scol</span> <span class="o">/</span> <span class="p">(</span><span class="n">count_scol</span> <span class="o">-</span> <span class="mi">2</span><span class="p">)),</span>
<span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">skew</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;skew&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unbiased kurtosis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).</span>
<span class="sd"> Normalized by N-1.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> kurt : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.kurtosis()</span>
<span class="sd"> a -1.5</span>
<span class="sd"> b -1.5</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].kurtosis()</span>
<span class="sd"> -1.5</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">kurtosis</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">kurtosis</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;kurtosis&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span>
<span class="p">)</span>
<span class="n">kurt</span> <span class="o">=</span> <span class="n">kurtosis</span>
<span class="k">def</span> <span class="nf">min</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the minimum of the values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span>
<span class="sd"> pandas compatibility. False is supported; however, the columns should</span>
<span class="sd"> be all numeric or all non-numeric.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> min : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.min()</span>
<span class="sd"> a 1.0</span>
<span class="sd"> b 0.1</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.min(axis=1)</span>
<span class="sd"> 0 0.1</span>
<span class="sd"> 1 0.2</span>
<span class="sd"> 2 0.3</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].min()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;min&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">max</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the maximum of the values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span>
<span class="sd"> pandas compatibility. False is supported; however, the columns should</span>
<span class="sd"> be all numeric or all non-numeric.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> max : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.max()</span>
<span class="sd"> a 3.0</span>
<span class="sd"> b 0.3</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.max(axis=1)</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 3.0</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].max()</span>
<span class="sd"> 3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;max&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">count</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Count non-NA cells for each column.</span>
<span class="sd"> The values `None`, `NaN` are considered NA.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or ‘index’, 1 or ‘columns’}, default 0</span>
<span class="sd"> If 0 or ‘index’ counts are generated for each column. If 1 or ‘columns’ counts are</span>
<span class="sd"> generated for each row.</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span>
<span class="sd"> pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> count : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.shape: Number of DataFrame rows and columns (including NA</span>
<span class="sd"> elements).</span>
<span class="sd"> DataFrame.isna: Boolean same-sized DataFrame showing places of NA</span>
<span class="sd"> elements.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Constructing DataFrame from a dictionary:</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;Person&quot;:</span>
<span class="sd"> ... [&quot;John&quot;, &quot;Myla&quot;, &quot;Lewis&quot;, &quot;John&quot;, &quot;Myla&quot;],</span>
<span class="sd"> ... &quot;Age&quot;: [24., np.nan, 21., 33, 26],</span>
<span class="sd"> ... &quot;Single&quot;: [False, True, True, True, False]},</span>
<span class="sd"> ... columns=[&quot;Person&quot;, &quot;Age&quot;, &quot;Single&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> Person Age Single</span>
<span class="sd"> 0 John 24.0 False</span>
<span class="sd"> 1 Myla NaN True</span>
<span class="sd"> 2 Lewis 21.0 True</span>
<span class="sd"> 3 John 33.0 True</span>
<span class="sd"> 4 Myla 26.0 False</span>
<span class="sd"> Notice the uncounted NA values:</span>
<span class="sd"> &gt;&gt;&gt; df.count()</span>
<span class="sd"> Person 5</span>
<span class="sd"> Age 4</span>
<span class="sd"> Single 5</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.count(axis=1)</span>
<span class="sd"> 0 3</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 3</span>
<span class="sd"> 3 3</span>
<span class="sd"> 4 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;Person&#39;].count()</span>
<span class="sd"> 5</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;Age&#39;].count()</span>
<span class="sd"> 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">Frame</span><span class="o">.</span><span class="n">_count_expr</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return sample standard deviation.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> ddof : int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> std : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.std()</span>
<span class="sd"> a 1.0</span>
<span class="sd"> b 0.1</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.std(axis=1)</span>
<span class="sd"> 0 0.636396</span>
<span class="sd"> 1 1.272792</span>
<span class="sd"> 2 1.909188</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.std(ddof=0)</span>
<span class="sd"> a 0.816497</span>
<span class="sd"> b 0.081650</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].std()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].std(ddof=0)</span>
<span class="sd"> 0.816496580927726</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_pop</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_samp</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">std</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;std&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">var</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unbiased variance.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> ddof : int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> var : scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.var()</span>
<span class="sd"> a 1.00</span>
<span class="sd"> b 0.01</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.var(axis=1)</span>
<span class="sd"> 0 0.405</span>
<span class="sd"> 1 1.620</span>
<span class="sd"> 2 3.645</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.var(ddof=0)</span>
<span class="sd"> a 0.666667</span>
<span class="sd"> b 0.006667</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].var()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].var(ddof=0)</span>
<span class="sd"> 0.6666666666666666</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">var_pop</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">var_samp</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">var</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;var&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">median</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the median of the values for the requested axis.</span>
<span class="sd"> .. note:: Unlike pandas, the median in pandas-on-Spark is an approximate median based on</span>
<span class="sd"> approximate percentile computation, because computing an exact median across a large</span>
<span class="sd"> dataset is extremely expensive.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> accuracy : int, default 10000</span>
<span class="sd"> Accuracy of the approximation. A larger value means better accuracy.</span>
<span class="sd"> The relative error of the approximation is 1.0 / accuracy.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> median : scalar or Series</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;a&#39;: [24., 21., 25., 33., 26.], &#39;b&#39;: [1, 2, 3, 4, 5]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 24.0 1</span>
<span class="sd"> 1 21.0 2</span>
<span class="sd"> 2 25.0 3</span>
<span class="sd"> 3 33.0 4</span>
<span class="sd"> 4 26.0 5</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.median()</span>
<span class="sd"> a 25.0</span>
<span class="sd"> b 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].median()</span>
<span class="sd"> 25.0</span>
<span class="sd"> &gt;&gt;&gt; (df[&#39;b&#39;] + 100).median()</span>
<span class="sd"> 103.0</span>
<span class="sd"> For multi-index columns,</span>
<span class="sd"> &gt;&gt;&gt; df.columns = pd.MultiIndex.from_tuples([(&#39;x&#39;, &#39;a&#39;), (&#39;y&#39;, &#39;b&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> x y</span>
<span class="sd"> a b</span>
<span class="sd"> 0 24.0 1</span>
<span class="sd"> 1 21.0 2</span>
<span class="sd"> 2 25.0 3</span>
<span class="sd"> 3 33.0 4</span>
<span class="sd"> 4 26.0 5</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.median()</span>
<span class="sd"> x a 25.0</span>
<span class="sd"> y b 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.median(axis=1)</span>
<span class="sd"> 0 12.5</span>
<span class="sd"> 1 11.5</span>
<span class="sd"> 2 14.0</span>
<span class="sd"> 3 18.5</span>
<span class="sd"> 4 15.5</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[(&#39;x&#39;, &#39;a&#39;)].median()</span>
<span class="sd"> 25.0</span>
<span class="sd"> &gt;&gt;&gt; (df[(&#39;y&#39;, &#39;b&#39;)] + 100).median()</span>
<span class="sd"> 103.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="p">(</span><span class="n">BooleanType</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)):</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">()),</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">median</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;median&quot;</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">sem</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unbiased standard error of the mean over requested axis.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> ddof : int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> numeric_only : bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> scalar (for Series) or Series (for DataFrame)</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&quot;a&quot;: [1, 2, 3], &quot;b&quot;: [4, 5, 6]})</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b</span>
<span class="sd"> 0 1 4</span>
<span class="sd"> 1 2 5</span>
<span class="sd"> 2 3 6</span>
<span class="sd"> &gt;&gt;&gt; psdf.sem()</span>
<span class="sd"> a 0.57735</span>
<span class="sd"> b 0.57735</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psdf.sem(ddof=0)</span>
<span class="sd"> a 0.471405</span>
<span class="sd"> b 0.471405</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psdf.sem(axis=1)</span>
<span class="sd"> 0 1.5</span>
<span class="sd"> 1 1.5</span>
<span class="sd"> 2 1.5</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> Support for Series</span>
<span class="sd"> &gt;&gt;&gt; psser = psdf.a</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 3</span>
<span class="sd"> Name: a, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; psser.sem()</span>
<span class="sd"> 0.5773502691896258</span>
<span class="sd"> &gt;&gt;&gt; psser.sem(ddof=0)</span>
<span class="sd"> 0.47140452079103173</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_pop</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_samp</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">sem</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">std</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> <span class="o">/</span> <span class="nb">pow</span><span class="p">(</span><span class="n">Frame</span><span class="o">.</span><span class="n">_count_expr</span><span class="p">(</span><span class="n">psser</span><span class="p">),</span> <span class="mf">0.5</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">sem</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;sem&quot;</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span>
<span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an int representing the number of elements in this object.</span>
<span class="sd"> Return the number of rows if Series. Otherwise return the number of</span>
<span class="sd"> rows times number of columns if DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series({&#39;a&#39;: 1, &#39;b&#39;: 2, &#39;c&#39;: None})</span>
<span class="sd"> &gt;&gt;&gt; s.size</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;col1&#39;: [1, 2, None], &#39;col2&#39;: [3, 4, None]})</span>
<span class="sd"> &gt;&gt;&gt; df.size</span>
<span class="sd"> 6</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(index=[1, 2, None])</span>
<span class="sd"> &gt;&gt;&gt; df.size</span>
<span class="sd"> 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">num_columns</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span>
<span class="k">if</span> <span class="n">num_columns</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="mi">0</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">*</span> <span class="n">num_columns</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a Series/DataFrame with absolute numeric value of each element.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> abs : Series/DataFrame containing the absolute value of each element.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Absolute numeric values in a Series.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([-1.10, 2, -3.33, 4])</span>
<span class="sd"> &gt;&gt;&gt; s.abs()</span>
<span class="sd"> 0 1.10</span>
<span class="sd"> 1 2.00</span>
<span class="sd"> 2 3.33</span>
<span class="sd"> 3 4.00</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> Absolute numeric values in a DataFrame.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;a&#39;: [4, 5, 6, 7],</span>
<span class="sd"> ... &#39;b&#39;: [10, 20, 30, 40],</span>
<span class="sd"> ... &#39;c&#39;: [100, 50, -30, -50]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.abs()</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 4 10 100</span>
<span class="sd"> 1 5 20 50</span>
<span class="sd"> 2 6 30 30</span>
<span class="sd"> 3 7 40 50</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">Column</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="k">return</span> <span class="n">psser</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span> <span class="n">field</span><span class="o">=</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;bad operand type for abs(): </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">)&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">),</span>
<span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">(),</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="nb">abs</span><span class="p">)</span>
<span class="c1"># TODO: for now, the by argument only supports grouping names, and only as_index is supported.</span>
<span class="c1"># Documentation should be updated when more options are supported.</span>
<span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">by</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]]],</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupBy[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Group DataFrame or Series using one or more columns.</span>
<span class="sd"> A groupby operation involves some combination of splitting the</span>
<span class="sd"> object, applying a function, and combining the results. This can be</span>
<span class="sd"> used to group large amounts of data and compute operations on these</span>
<span class="sd"> groups.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> by : Series, label, or list of labels</span>
<span class="sd"> Used to determine the groups for the groupby.</span>
<span class="sd"> If a Series is passed, the Series values</span>
<span class="sd"> will be used to determine the groups. A label or list of</span>
<span class="sd"> labels may be passed to group by the columns in ``self``.</span>
<span class="sd"> axis : {0 or &#39;index&#39;}, default 0</span>
<span class="sd"> Can only be set to 0 at the moment.</span>
<span class="sd"> as_index : bool, default True</span>
<span class="sd"> For aggregated output, return object with group labels as the</span>
<span class="sd"> index. Only relevant for DataFrame input. as_index=False is</span>
<span class="sd"> effectively &quot;SQL-style&quot; grouped output.</span>
<span class="sd"> dropna : bool, default True</span>
<span class="sd"> If True, and if group keys contain NA values,</span>
<span class="sd"> NA values together with row/column will be dropped.</span>
<span class="sd"> If False, NA values will also be treated as the key in groups.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrameGroupBy or SeriesGroupBy</span>
<span class="sd"> Depends on the calling object and returns groupby object that</span>
<span class="sd"> contains information about the groups.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.groupby.GroupBy</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;Animal&#39;: [&#39;Falcon&#39;, &#39;Falcon&#39;,</span>
<span class="sd"> ... &#39;Parrot&#39;, &#39;Parrot&#39;],</span>
<span class="sd"> ... &#39;Max Speed&#39;: [380., 370., 24., 26.]},</span>
<span class="sd"> ... columns=[&#39;Animal&#39;, &#39;Max Speed&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> Animal Max Speed</span>
<span class="sd"> 0 Falcon 380.0</span>
<span class="sd"> 1 Falcon 370.0</span>
<span class="sd"> 2 Parrot 24.0</span>
<span class="sd"> 3 Parrot 26.0</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;Animal&#39;]).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> Max Speed</span>
<span class="sd"> Animal</span>
<span class="sd"> Falcon 375.0</span>
<span class="sd"> Parrot 25.0</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;Animal&#39;], as_index=False).mean().sort_values(&#39;Animal&#39;)</span>
<span class="sd"> ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> Animal Max Speed</span>
<span class="sd"> ...Falcon 375.0</span>
<span class="sd"> ...Parrot 25.0</span>
<span class="sd"> We can also choose whether to include NA in group keys by setting the dropna parameter;</span>
<span class="sd"> the default setting is True:</span>
<span class="sd"> &gt;&gt;&gt; l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(l, columns=[&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(by=[&quot;b&quot;]).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a c</span>
<span class="sd"> b</span>
<span class="sd"> 1.0 2 3</span>
<span class="sd"> 2.0 2 5</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(by=[&quot;b&quot;], dropna=False).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a c</span>
<span class="sd"> b</span>
<span class="sd"> 1.0 2 3</span>
<span class="sd"> 2.0 2 5</span>
<span class="sd"> NaN 1 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">new_by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Label</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">]]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Grouper for &#39;</span><span class="si">{}</span><span class="s2">&#39; not 1-dimensional&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">by</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">by</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">by</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">by</span><span class="p">)</span>
<span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">by</span><span class="p">)]</span>
<span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">by</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">by</span><span class="p">)</span>
<span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">by</span><span class="p">,))]</span>
<span class="k">elif</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">by</span><span class="p">):</span>
<span class="n">new_by</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">by</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Grouper for &#39;</span><span class="si">{}</span><span class="s2">&#39; not 1-dimensional&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">key</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">key</span><span class="p">))</span>
<span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">key</span><span class="p">,)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Grouper for &#39;</span><span class="si">{}</span><span class="s2">&#39; not 1-dimensional&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">key</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Grouper for &#39;</span><span class="si">{}</span><span class="s2">&#39; not 1-dimensional&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">by</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">len</span><span class="p">(</span><span class="n">new_by</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;No group keys passed!&quot;</span><span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">&#39;axis should be either 0 or &quot;index&quot; currently.&#39;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_build_groupby</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">new_by</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_build_groupby</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupBy[FrameLike]&quot;</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">def</span> <span class="nf">bool</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the bool of a single element in the current object.</span>
<span class="sd"> This must be a boolean scalar value, either True or False. Raise a ValueError if</span>
<span class="sd"> the object does not have exactly 1 element, or that element is not boolean.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&#39;a&#39;: [True]}).bool()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([False]).bool()</span>
<span class="sd"> False</span>
<span class="sd"> If the value is non-boolean or there are multiple values, it raises an exception in all</span>
<span class="sd"> cases, as shown below.</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&#39;a&#39;: [&#39;a&#39;]}).bool()</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: bool cannot act on a non-boolean single element DataFrame</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&#39;a&#39;: [True], &#39;b&#39;: [False]}).bool() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(),</span>
<span class="sd"> a.item(), a.any() or a.all().</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1]).bool()</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: bool cannot act on a non-boolean single element DataFrame</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">df</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">df</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_dataframe</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;bool() expects DataFrame or Series; however, &quot;</span> <span class="s2">&quot;got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="p">,))</span>
<span class="k">return</span> <span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">bool</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">first_valid_index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Retrieves the index of the first valid value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> scalar, tuple, or None</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Support for DataFrame</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&#39;a&#39;: [None, 2, 3, 2],</span>
<span class="sd"> ... &#39;b&#39;: [None, 2.0, 3.0, 1.0],</span>
<span class="sd"> ... &#39;c&#39;: [None, 200, 400, 200]},</span>
<span class="sd"> ... index=[&#39;Q&#39;, &#39;W&#39;, &#39;E&#39;, &#39;R&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> Q NaN NaN NaN</span>
<span class="sd"> W 2.0 2.0 200.0</span>
<span class="sd"> E 3.0 3.0 400.0</span>
<span class="sd"> R 2.0 1.0 200.0</span>
<span class="sd"> &gt;&gt;&gt; psdf.first_valid_index()</span>
<span class="sd"> &#39;W&#39;</span>
<span class="sd"> Support for MultiIndex columns</span>
<span class="sd"> &gt;&gt;&gt; psdf.columns = pd.MultiIndex.from_tuples([(&#39;a&#39;, &#39;x&#39;), (&#39;b&#39;, &#39;y&#39;), (&#39;c&#39;, &#39;z&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> x y z</span>
<span class="sd"> Q NaN NaN NaN</span>
<span class="sd"> W 2.0 2.0 200.0</span>
<span class="sd"> E 3.0 3.0 400.0</span>
<span class="sd"> R 2.0 1.0 200.0</span>
<span class="sd"> &gt;&gt;&gt; psdf.first_valid_index()</span>
<span class="sd"> &#39;W&#39;</span>
<span class="sd"> Support for Series.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([None, None, 3, 4, 5], index=[100, 200, 300, 400, 500])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> 100 NaN</span>
<span class="sd"> 200 NaN</span>
<span class="sd"> 300 3.0</span>
<span class="sd"> 400 4.0</span>
<span class="sd"> 500 5.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.first_valid_index()</span>
<span class="sd"> 300</span>
<span class="sd"> Support for MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = pd.MultiIndex([[&#39;lama&#39;, &#39;cow&#39;, &#39;falcon&#39;],</span>
<span class="sd"> ... [&#39;speed&#39;, &#39;weight&#39;, &#39;length&#39;]],</span>
<span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span>
<span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([None, None, None, None, 250, 1.5, 320, 1, 0.3], index=midx)</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> lama speed NaN</span>
<span class="sd"> weight NaN</span>
<span class="sd"> length NaN</span>
<span class="sd"> cow speed NaN</span>
<span class="sd"> weight 250.0</span>
<span class="sd"> length 1.5</span>
<span class="sd"> falcon speed 320.0</span>
<span class="sd"> weight 1.0</span>
<span class="sd"> length 0.3</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.first_valid_index()</span>
<span class="sd"> (&#39;cow&#39;, &#39;weight&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">data_spark_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&amp;</span> <span class="n">y</span><span class="p">,</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">data_spark_columns</span><span class="p">))</span>
<span class="k">with</span> <span class="n">sql_conf</span><span class="p">({</span><span class="n">SPARK_CONF_ARROW_ENABLED</span><span class="p">:</span> <span class="kc">False</span><span class="p">}):</span>
<span class="c1"># Disable Arrow to keep row ordering.</span>
<span class="n">first_valid_row</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span>
<span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="o">.</span><span class="n">toPandas</span><span class="p">()</span>
<span class="p">)</span>
<span class="c1"># For Empty Series or DataFrame, returns None.</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">first_valid_row</span> <span class="o">=</span> <span class="n">first_valid_row</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">return</span> <span class="n">first_valid_row</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">last_valid_index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return index for last non-NA/null value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> scalar, tuple, or None</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API only works with PySpark &gt;= 3.0.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Support for DataFrame</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&#39;a&#39;: [1, 2, 3, None],</span>
<span class="sd"> ... &#39;b&#39;: [1.0, 2.0, 3.0, None],</span>
<span class="sd"> ... &#39;c&#39;: [100, 200, 400, None]},</span>
<span class="sd"> ... index=[&#39;Q&#39;, &#39;W&#39;, &#39;E&#39;, &#39;R&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> Q 1.0 1.0 100.0</span>
<span class="sd"> W 2.0 2.0 200.0</span>
<span class="sd"> E 3.0 3.0 400.0</span>
<span class="sd"> R NaN NaN NaN</span>
<span class="sd"> &gt;&gt;&gt; psdf.last_valid_index() # doctest: +SKIP</span>
<span class="sd"> &#39;E&#39;</span>
<span class="sd"> Support for MultiIndex columns</span>
<span class="sd"> &gt;&gt;&gt; psdf.columns = pd.MultiIndex.from_tuples([(&#39;a&#39;, &#39;x&#39;), (&#39;b&#39;, &#39;y&#39;), (&#39;c&#39;, &#39;z&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> x y z</span>
<span class="sd"> Q 1.0 1.0 100.0</span>
<span class="sd"> W 2.0 2.0 200.0</span>
<span class="sd"> E 3.0 3.0 400.0</span>
<span class="sd"> R NaN NaN NaN</span>
<span class="sd"> &gt;&gt;&gt; psdf.last_valid_index() # doctest: +SKIP</span>
<span class="sd"> &#39;E&#39;</span>
<span class="sd"> Support for Series.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([1, 2, 3, None, None], index=[100, 200, 300, 400, 500])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> 100 1.0</span>
<span class="sd"> 200 2.0</span>
<span class="sd"> 300 3.0</span>
<span class="sd"> 400 NaN</span>
<span class="sd"> 500 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.last_valid_index() # doctest: +SKIP</span>
<span class="sd"> 300</span>
<span class="sd"> Support for MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = pd.MultiIndex([[&#39;lama&#39;, &#39;cow&#39;, &#39;falcon&#39;],</span>
<span class="sd"> ... [&#39;speed&#39;, &#39;weight&#39;, &#39;length&#39;]],</span>
<span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span>
<span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([250, 1.5, 320, 1, 0.3, None, None, None, None], index=midx)</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> lama speed 250.0</span>
<span class="sd"> weight 1.5</span>
<span class="sd"> length 320.0</span>
<span class="sd"> cow speed 1.0</span>
<span class="sd"> weight 0.3</span>
<span class="sd"> length NaN</span>
<span class="sd"> falcon speed NaN</span>
<span class="sd"> weight NaN</span>
<span class="sd"> length NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.last_valid_index() # doctest: +SKIP</span>
<span class="sd"> (&#39;cow&#39;, &#39;weight&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">data_spark_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&amp;</span> <span class="n">y</span><span class="p">,</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">data_spark_columns</span><span class="p">))</span>
<span class="n">last_valid_rows</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span>
<span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="p">)</span>
<span class="c1"># For Empty Series or DataFrame, returns None.</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">last_valid_rows</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">last_valid_row</span> <span class="o">=</span> <span class="n">last_valid_rows</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">last_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">return</span> <span class="n">last_valid_row</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">last_valid_row</span><span class="p">)</span>
<span class="c1"># TODO: &#39;center&#39;, &#39;win_type&#39;, &#39;on&#39;, &#39;axis&#39; parameters should be implemented.</span>
<span class="k">def</span> <span class="nf">rolling</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">window</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Rolling[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Provide rolling transformations.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> in the near future.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> window : int, or offset</span>
<span class="sd"> Size of the moving window.</span>
<span class="sd"> This is the number of observations used for calculating the statistic.</span>
<span class="sd"> Each window will be a fixed size.</span>
<span class="sd"> min_periods : int, default None</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> For a window that is specified by an offset, min_periods will default to 1.</span>
<span class="sd"> Otherwise, min_periods will default to the size of the window.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> a Window sub-classed for the particular operation</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Rolling</span>
<span class="k">return</span> <span class="n">Rolling</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="o">=</span><span class="n">window</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span>
<span class="c1"># TODO: &#39;center&#39; and &#39;axis&#39; parameters should be implemented.</span>
<span class="c1"># &#39;axis&#39; implementation, refer to https://github.com/databricks/koalas/pull/607</span>
<span class="k">def</span> <span class="nf">expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Expanding[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Provide expanding transformations.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> in the near future.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> min_periods : int, default 1</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> a Window sub-classed for the particular operation</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Expanding</span>
<span class="k">return</span> <span class="n">Expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">default</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Get item from object for given key (DataFrame column, Panel slice,</span>
<span class="sd"> etc.). Returns default value if not found.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> key : object</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> value : same type as items contained in object</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;x&#39;:range(3), &#39;y&#39;:[&#39;a&#39;,&#39;b&#39;,&#39;b&#39;], &#39;z&#39;:[&#39;a&#39;,&#39;b&#39;,&#39;b&#39;]},</span>
<span class="sd"> ... columns=[&#39;x&#39;, &#39;y&#39;, &#39;z&#39;], index=[10, 20, 20])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> x y z</span>
<span class="sd"> 10 0 a a</span>
<span class="sd"> 20 1 b b</span>
<span class="sd"> 20 2 b b</span>
<span class="sd"> &gt;&gt;&gt; df.get(&#39;x&#39;)</span>
<span class="sd"> 10 0</span>
<span class="sd"> 20 1</span>
<span class="sd"> 20 2</span>
<span class="sd"> Name: x, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.get([&#39;x&#39;, &#39;y&#39;])</span>
<span class="sd"> x y</span>
<span class="sd"> 10 0 a</span>
<span class="sd"> 20 1 b</span>
<span class="sd"> 20 2 b</span>
<span class="sd"> &gt;&gt;&gt; df.x.get(10)</span>
<span class="sd"> 0</span>
<span class="sd"> &gt;&gt;&gt; df.x.get(20)</span>
<span class="sd"> 20 1</span>
<span class="sd"> 20 2</span>
<span class="sd"> Name: x, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.x.get(15, -1)</span>
<span class="sd"> -1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
<span class="k">except</span> <span class="p">(</span><span class="ne">KeyError</span><span class="p">,</span> <span class="ne">ValueError</span><span class="p">,</span> <span class="ne">IndexError</span><span class="p">):</span>
<span class="k">return</span> <span class="n">default</span>
<span class="k">def</span> <span class="nf">squeeze</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Squeeze 1 dimensional axis objects into scalars.</span>
<span class="sd"> Series or DataFrames with a single element are squeezed to a scalar.</span>
<span class="sd"> DataFrames with a single column or a single row are squeezed to a</span>
<span class="sd"> Series. Otherwise the object is unchanged.</span>
<span class="sd"> This method is most useful when you don&#39;t know if your</span>
<span class="sd"> object is a Series or DataFrame, but you do know it has just a single</span>
<span class="sd"> column. In that case you can safely call `squeeze` to ensure you have a</span>
<span class="sd"> Series.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or &#39;index&#39;, 1 or &#39;columns&#39;, None}, default None</span>
<span class="sd"> A specific axis to squeeze. By default, all length-1 axes are</span>
<span class="sd"> squeezed.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame, Series, or scalar</span>
<span class="sd"> The projection after squeezing `axis` or all the axes.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.iloc : Integer-location based indexing for selecting scalars.</span>
<span class="sd"> DataFrame.iloc : Integer-location based indexing for selecting Series.</span>
<span class="sd"> Series.to_frame : Inverse of DataFrame.squeeze for a</span>
<span class="sd"> single-column DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; primes = ps.Series([2, 3, 5, 7])</span>
<span class="sd"> Slicing might produce a Series with a single value:</span>
<span class="sd"> &gt;&gt;&gt; even_primes = primes[primes % 2 == 0]</span>
<span class="sd"> &gt;&gt;&gt; even_primes</span>
<span class="sd"> 0 2</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; even_primes.squeeze()</span>
<span class="sd"> 2</span>
<span class="sd"> Squeezing objects with more than one value in every axis does nothing:</span>
<span class="sd"> &gt;&gt;&gt; odd_primes = primes[primes % 2 == 1]</span>
<span class="sd"> &gt;&gt;&gt; odd_primes</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 5</span>
<span class="sd"> 3 7</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; odd_primes.squeeze()</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 5</span>
<span class="sd"> 3 7</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> Squeezing is even more effective when used with DataFrames.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[1, 2], [3, 4]], columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 1 2</span>
<span class="sd"> 1 3 4</span>
<span class="sd"> Slicing a single column will produce a DataFrame with the columns</span>
<span class="sd"> having only one value:</span>
<span class="sd"> &gt;&gt;&gt; df_a = df[[&#39;a&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; df_a</span>
<span class="sd"> a</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 3</span>
<span class="sd"> So the columns can be squeezed down, resulting in a Series:</span>
<span class="sd"> &gt;&gt;&gt; df_a.squeeze(&#39;columns&#39;)</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 3</span>
<span class="sd"> Name: a, dtype: int64</span>
<span class="sd"> Slicing a single row from a single column will produce a single</span>
<span class="sd"> scalar DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df_1a = df.loc[[1], [&#39;a&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; df_1a</span>
<span class="sd"> a</span>
<span class="sd"> 1 3</span>
<span class="sd"> Squeezing the rows produces a single scalar Series:</span>
<span class="sd"> &gt;&gt;&gt; df_1a.squeeze(&#39;rows&#39;)</span>
<span class="sd"> a 3</span>
<span class="sd"> Name: 1, dtype: int64</span>
<span class="sd"> Squeezing all axes will project directly into a scalar:</span>
<span class="sd"> &gt;&gt;&gt; df_1a.squeeze()</span>
<span class="sd"> 3</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">axis</span> <span class="o">=</span> <span class="s2">&quot;index&quot;</span> <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="s2">&quot;rows&quot;</span> <span class="k">else</span> <span class="n">axis</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span>
<span class="n">is_squeezable</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">[:</span><span class="mi">2</span><span class="p">])</span> <span class="o">==</span> <span class="mi">1</span>
<span class="c1"># If DataFrame has multiple columns, there is no change.</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_squeezable</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="n">series_from_column</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">has_single_value</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">series_from_column</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> <span class="o">==</span> <span class="mi">1</span>
<span class="c1"># If DataFrame has only a single value, use pandas API directly.</span>
<span class="k">if</span> <span class="n">has_single_value</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">squeeze</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">return</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="k">else</span> <span class="n">result</span>
<span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">series_from_column</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># The case of Series is simple.</span>
<span class="c1"># If Series has only a single value, just return it as a scalar.</span>
<span class="c1"># Otherwise, there is no change.</span>
<span class="n">self_top_two</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="n">has_single_value</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">self_top_two</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="n">self_top_two</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">has_single_value</span> <span class="k">else</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">truncate</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">before</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">after</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">copy</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrameOrSeries</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Truncate a Series or DataFrame before and after some index value.</span>
<span class="sd"> This is a useful shorthand for boolean indexing based on index</span>
<span class="sd"> values above or below certain thresholds.</span>
<span class="sd"> .. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`</span>
<span class="sd"> which can be expensive.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> before : date, str, int</span>
<span class="sd"> Truncate all rows before this index value.</span>
<span class="sd"> after : date, str, int</span>
<span class="sd"> Truncate all rows after this index value.</span>
<span class="sd"> axis : {0 or &#39;index&#39;, 1 or &#39;columns&#39;}, optional</span>
<span class="sd"> Axis to truncate. Truncates the index (rows) by default.</span>
<span class="sd"> copy : bool, default True</span>
<span class="sd"> Return a copy of the truncated section.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> type of caller</span>
<span class="sd"> The truncated Series or DataFrame.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.loc : Select a subset of a DataFrame by label.</span>
<span class="sd"> DataFrame.iloc : Select a subset of a DataFrame by position.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;],</span>
<span class="sd"> ... &#39;B&#39;: [&#39;f&#39;, &#39;g&#39;, &#39;h&#39;, &#39;i&#39;, &#39;j&#39;],</span>
<span class="sd"> ... &#39;C&#39;: [&#39;k&#39;, &#39;l&#39;, &#39;m&#39;, &#39;n&#39;, &#39;o&#39;]},</span>
<span class="sd"> ... index=[1, 2, 3, 4, 5])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 1 a f k</span>
<span class="sd"> 2 b g l</span>
<span class="sd"> 3 c h m</span>
<span class="sd"> 4 d i n</span>
<span class="sd"> 5 e j o</span>
<span class="sd"> &gt;&gt;&gt; df.truncate(before=2, after=4)</span>
<span class="sd"> A B C</span>
<span class="sd"> 2 b g l</span>
<span class="sd"> 3 c h m</span>
<span class="sd"> 4 d i n</span>
<span class="sd"> The columns of a DataFrame can be truncated.</span>
<span class="sd"> &gt;&gt;&gt; df.truncate(before=&quot;A&quot;, after=&quot;B&quot;, axis=&quot;columns&quot;)</span>
<span class="sd"> A B</span>
<span class="sd"> 1 a f</span>
<span class="sd"> 2 b g</span>
<span class="sd"> 3 c h</span>
<span class="sd"> 4 d i</span>
<span class="sd"> 5 e j</span>
<span class="sd"> For Series, only rows can be truncated.</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;A&#39;].truncate(before=2, after=4)</span>
<span class="sd"> 2 b</span>
<span class="sd"> 3 c</span>
<span class="sd"> 4 d</span>
<span class="sd"> Name: A, dtype: object</span>
<span class="sd"> A Series whose index is sorted integers.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([10, 20, 30, 40, 50, 60, 70],</span>
<span class="sd"> ... index=[1, 2, 3, 4, 5, 6, 7])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> 1 10</span>
<span class="sd"> 2 20</span>
<span class="sd"> 3 30</span>
<span class="sd"> 4 40</span>
<span class="sd"> 5 50</span>
<span class="sd"> 6 60</span>
<span class="sd"> 7 70</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; s.truncate(2, 5)</span>
<span class="sd"> 2 20</span>
<span class="sd"> 3 30</span>
<span class="sd"> 4 40</span>
<span class="sd"> 5 50</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> A Series whose index is sorted strings.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([10, 20, 30, 40, 50, 60, 70],</span>
<span class="sd"> ... index=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;, &#39;f&#39;, &#39;g&#39;])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> a 10</span>
<span class="sd"> b 20</span>
<span class="sd"> c 30</span>
<span class="sd"> d 40</span>
<span class="sd"> e 50</span>
<span class="sd"> f 60</span>
<span class="sd"> g 70</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; s.truncate(&#39;b&#39;, &#39;e&#39;)</span>
<span class="sd"> b 20</span>
<span class="sd"> c 30</span>
<span class="sd"> d 40</span>
<span class="sd"> e 50</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="n">indexes</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span>
<span class="n">indexes_increasing</span> <span class="o">=</span> <span class="n">indexes</span><span class="o">.</span><span class="n">is_monotonic_increasing</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">indexes_increasing</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">indexes</span><span class="o">.</span><span class="n">is_monotonic_decreasing</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;truncate requires a sorted index&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="p">(</span><span class="n">before</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span><span class="n">after</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">):</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> <span class="k">if</span> <span class="n">copy</span> <span class="k">else</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">if</span> <span class="p">(</span><span class="n">before</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">after</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">and</span> <span class="n">before</span> <span class="o">&gt;</span> <span class="n">after</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Truncate: </span><span class="si">%s</span><span class="s2"> must be after </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">after</span><span class="p">,</span> <span class="n">before</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">if</span> <span class="n">indexes_increasing</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[arg-type, assignment]</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">after</span><span class="p">:</span><span class="n">before</span><span class="p">]</span> <span class="c1"># type: ignore[arg-type,assignment]</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">if</span> <span class="n">indexes_increasing</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">after</span><span class="p">:</span><span class="n">before</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrameOrSeries</span><span class="p">,</span> <span class="n">result</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> <span class="k">if</span> <span class="n">copy</span> <span class="k">else</span> <span class="n">result</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">to_markdown</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Print Series or DataFrame in Markdown-friendly format.</span>
<span class="sd"> .. note:: This method should only be used if the resulting pandas object is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> buf : writable buffer, defaults to sys.stdout</span>
<span class="sd"> Where to send the output. By default, the output is printed to</span>
<span class="sd"> sys.stdout. Pass a writable buffer if you need to further process</span>
<span class="sd"> the output.</span>
<span class="sd"> mode : str, optional</span>
<span class="sd"> Mode in which file is opened.</span>
<span class="sd"> **kwargs</span>
<span class="sd"> These parameters will be passed to `tabulate`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> str</span>
<span class="sd"> Series or DataFrame in Markdown-friendly format.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Requires the `tabulate &lt;https://pypi.org/project/tabulate&gt;`_ package.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([&quot;elk&quot;, &quot;pig&quot;, &quot;dog&quot;, &quot;quetzal&quot;], name=&quot;animal&quot;)</span>
<span class="sd"> &gt;&gt;&gt; print(psser.to_markdown()) # doctest: +SKIP</span>
<span class="sd"> | | animal |</span>
<span class="sd"> |---:|:---------|</span>
<span class="sd"> | 0 | elk |</span>
<span class="sd"> | 1 | pig |</span>
<span class="sd"> | 2 | dog |</span>
<span class="sd"> | 3 | quetzal |</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame(</span>
<span class="sd"> ... data={&quot;animal_1&quot;: [&quot;elk&quot;, &quot;pig&quot;], &quot;animal_2&quot;: [&quot;dog&quot;, &quot;quetzal&quot;]}</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; print(psdf.to_markdown()) # doctest: +SKIP</span>
<span class="sd"> | | animal_1 | animal_2 |</span>
<span class="sd"> |---:|:-----------|:-----------|</span>
<span class="sd"> | 0 | elk | dog |</span>
<span class="sd"> | 1 | pig | quetzal |</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;`to_markdown` loads all data into the driver&#39;s memory. &quot;</span>
<span class="s2">&quot;It should only be used if the resulting pandas object is expected to be small.&quot;</span>
<span class="p">)</span>
<span class="c1"># Make sure locals() call is at the top of the function so we don&#39;t capture local variables.</span>
<span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span>
<span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="bp">self</span>
<span class="n">internal_pandas</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span>
<span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span>
<span class="n">internal_pandas</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">,</span> <span class="nb">type</span><span class="p">(</span><span class="n">internal_pandas</span><span class="p">)</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">,</span> <span class="n">args</span>
<span class="p">)</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="c1"># TODO: add &#39;downcast&#39; when value parameter exists</span>
<span class="k">def</span> <span class="nf">bfill</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=&#39;bfill&#39;``.</span>
<span class="sd"> .. note:: the current implementation of &#39;bfill&#39; uses Spark&#39;s Window</span>
<span class="sd"> without specifying partition specification. This moves all data into a</span>
<span class="sd"> single partition on a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace : boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit : int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> DataFrame or Series with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [None, 3, None, None],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 NaN 2.0 NaN 0</span>
<span class="sd"> 1 3.0 4.0 NaN 1</span>
<span class="sd"> 2 NaN NaN NaN 5</span>
<span class="sd"> 3 NaN 3.0 1.0 4</span>
<span class="sd"> Propagate non-null values backward.</span>
<span class="sd"> &gt;&gt;&gt; psdf.bfill()</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 3.0 2.0 1.0 0</span>
<span class="sd"> 1 3.0 4.0 1.0 1</span>
<span class="sd"> 2 NaN 3.0 1.0 5</span>
<span class="sd"> 3 NaN 3.0 1.0 4</span>
<span class="sd"> For Series</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([None, None, None, 1])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 NaN</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 1.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psser.bfill()</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 1.0</span>
<span class="sd"> 3 1.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">&quot;bfill&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="n">inplace</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span>
<span class="n">backfill</span> <span class="o">=</span> <span class="n">bfill</span>
<span class="c1"># TODO: add &#39;downcast&#39; when value parameter exists</span>
<span class="k">def</span> <span class="nf">ffill</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=&#39;ffill&#39;``.</span>
<span class="sd"> .. note:: the current implementation of &#39;ffill&#39; uses Spark&#39;s Window</span>
<span class="sd"> without specifying a partition specification. This moves all data into a</span>
<span class="sd"> single partition on a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method on very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace : boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit : int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> DataFrame or Series with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [None, 3, None, None],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 NaN 2.0 NaN 0</span>
<span class="sd"> 1 3.0 4.0 NaN 1</span>
<span class="sd"> 2 NaN NaN NaN 5</span>
<span class="sd"> 3 NaN 3.0 1.0 4</span>
<span class="sd"> Propagate non-null values forward.</span>
<span class="sd"> &gt;&gt;&gt; psdf.ffill()</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 NaN 2.0 NaN 0</span>
<span class="sd"> 1 3.0 4.0 NaN 1</span>
<span class="sd"> 2 3.0 4.0 NaN 5</span>
<span class="sd"> 3 3.0 3.0 1.0 4</span>
<span class="sd"> For Series</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([2, 4, None, 3])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 4.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psser.ffill()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 4.0</span>
<span class="sd"> 2 4.0</span>
<span class="sd"> 3 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">&quot;ffill&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="n">inplace</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span>
<span class="n">pad</span> <span class="o">=</span> <span class="n">ffill</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">at</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">AtIndexer</span><span class="p">:</span>
<span class="k">return</span> <span class="n">AtIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">at</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">AtIndexer</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">iat</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">iAtIndexer</span><span class="p">:</span>
<span class="k">return</span> <span class="n">iAtIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">iat</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">iAtIndexer</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">iloc</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">iLocIndexer</span><span class="p">:</span>
<span class="k">return</span> <span class="n">iLocIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">iloc</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">iLocIndexer</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">loc</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">LocIndexer</span><span class="p">:</span>
<span class="k">return</span> <span class="n">LocIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">loc</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">LocIndexer</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="k">def</span> <span class="fm">__bool__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">NoReturn</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;The truth value of a </span><span class="si">{0}</span><span class="s2"> is ambiguous. &quot;</span>
<span class="s2">&quot;Use a.empty, a.bool(), a.item(), a.any() or a.all().&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_count_expr</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">shutil</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">tempfile</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.pandas.generic</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">generic</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;ps&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span>
<span class="n">spark</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;pyspark.pandas.generic tests&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;path&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">path</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">generic</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">shutil</span><span class="o">.</span><span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">ignore_errors</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</div>
<div class="prev-next-bottom">
</div>
</main>
</div>
</div>
<!-- Theme JavaScript bundle; the same URL is preloaded in <head> via <link rel="preload" as="script">. -->
<script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
  <div class="container">
    <!-- NOTE(review): the copyright holder is blank in the generated output; presumably the Sphinx `copyright` setting is empty. Confirm and set it at the source instead of hand-editing this page. -->
    <p>
      &copy; Copyright .<br>
      Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.
    </p>
  </div>
</footer>
</body>
</html>