| |
| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8" /> |
| <title>pyspark.pandas.generic — PySpark 3.3.4 documentation</title> |
| |
| <!-- Theme stylesheet. --> |
| <link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| |
| <!-- Font Awesome stylesheet plus preloads for the two webfonts it uses. --> |
| <link rel="stylesheet" |
| href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| <!-- Vendored text fonts (Open Sans, Lato). --> |
| <link rel="stylesheet" |
| href="../../../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" |
| href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| |
| <link rel="stylesheet" href="../../../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/language_data.js"></script> |
| <script src="../../../_static/clipboard.min.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <!-- The MathJax 2 in-line configuration block must come before the loader script: the loader is async and may run as soon as it is fetched, and it only picks up configuration blocks that already exist in the document. --> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/pandas/generic.html" /> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <!-- Fixed top bar: home link with logo, collapse toggle, and links to the top-level documentation sections. Labelled so it can be told apart from the other nav landmarks on the page. --> |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation"> |
| <div class="container-xl"> |
| |
| <a class="navbar-brand" href="../../../index.html"> |
| |
| <!-- The image is the only content of the home link, so its alt text names the link destination. --> |
| <img src="../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home" /> |
| |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../getting_started/index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| |
| </ul> |
| |
| |
| |
| |
| <ul class="navbar-nav"> |
| |
| |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <!-- Left sidebar: documentation search form followed by the (empty on this page) section navigation. --> |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get"> |
| <!-- Decorative icon; the input carries the accessible name. --> |
| <i class="icon fas fa-search" aria-hidden="true"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| |
| <ul class="nav bd-sidenav"> |
| |
| </ul> |
| |
| <!-- Explicitly close bd-toc-item; it was previously left open and only closed implicitly by the nav end tag. --> |
| </div> |
| </nav> |
| </div> |
| |
| |
| |
| <!-- Right-hand in-page table of contents (shown on xl screens only); it is the scroll-spy target named by the body element. --> |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| |
| <!-- Labelled so this landmark is distinguishable from the other nav elements on the page. --> |
| <nav id="bd-toc-nav" aria-label="On this page"> |
| <ul class="nav section-nav flex-column"> |
| |
| </ul> |
| </nav> |
| |
| |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <h1>Source code for pyspark.pandas.generic</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd">A base class of DataFrame/Column to behave similar to pandas DataFrame/Series.</span> |
| <span class="sd">"""</span> |
| <span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABCMeta</span><span class="p">,</span> <span class="n">abstractmethod</span> |
| <span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">Counter</span> |
| <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">Callable</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">Iterable</span><span class="p">,</span> |
| <span class="n">IO</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">NoReturn</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">TYPE_CHECKING</span><span class="p">,</span> |
| <span class="n">cast</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">import</span> <span class="nn">warnings</span> |
| |
| <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span> |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_list_like</span> <span class="c1"># type: ignore[attr-defined]</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">BooleanType</span><span class="p">,</span> |
| <span class="n">DoubleType</span><span class="p">,</span> |
| <span class="n">IntegralType</span><span class="p">,</span> |
| <span class="n">LongType</span><span class="p">,</span> |
| <span class="n">NumericType</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Axis</span><span class="p">,</span> |
| <span class="n">DataFrameOrSeries</span><span class="p">,</span> |
| <span class="n">Dtype</span><span class="p">,</span> |
| <span class="n">FrameLike</span><span class="p">,</span> |
| <span class="n">Label</span><span class="p">,</span> |
| <span class="n">Name</span><span class="p">,</span> |
| <span class="n">Scalar</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.indexing</span> <span class="kn">import</span> <span class="n">AtIndexer</span><span class="p">,</span> <span class="n">iAtIndexer</span><span class="p">,</span> <span class="n">iLocIndexer</span><span class="p">,</span> <span class="n">LocIndexer</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="n">InternalFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">spark_type_to_pandas_dtype</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">is_name_like_tuple</span><span class="p">,</span> |
| <span class="n">is_name_like_value</span><span class="p">,</span> |
| <span class="n">name_like_string</span><span class="p">,</span> |
| <span class="n">scol_for</span><span class="p">,</span> |
| <span class="n">sql_conf</span><span class="p">,</span> |
| <span class="n">validate_arguments_and_invoke_function</span><span class="p">,</span> |
| <span class="n">validate_axis</span><span class="p">,</span> |
| <span class="n">validate_mode</span><span class="p">,</span> |
| <span class="n">SPARK_CONF_ARROW_ENABLED</span><span class="p">,</span> |
| <span class="n">log_advice</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.base</span> <span class="kn">import</span> <span class="n">Index</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">GroupBy</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Rolling</span><span class="p">,</span> <span class="n">Expanding</span> |
| |
| |
| <span class="n">bool_type</span> <span class="o">=</span> <span class="nb">bool</span> |
| |
| |
| <span class="k">class</span> <span class="nc">Frame</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">metaclass</span><span class="o">=</span><span class="n">ABCMeta</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> The base class for both DataFrame and Series.</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_internal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">InternalFrame</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> |
| <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"Series"</span><span class="p">],</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Column</span><span class="p">]],</span> |
| <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"Series"</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Scalar</span><span class="p">]:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">dtypes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Index"</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_to_internal_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="c1"># TODO: add 'axis' parameter</span> |
| <span class="k">def</span> <span class="nf">cummin</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return cumulative minimum over a DataFrame or Series axis.</span> |
| |
| <span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative minimum.</span> |
| |
| <span class="sd"> .. note:: the current implementation of cummin uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This leads to moving all data into</span> |
| <span class="sd"> a single partition in a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method with very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : boolean, default True</span> |
| <span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.min : Return the minimum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cummax : Return cumulative maximum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cummin : Return cumulative minimum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cumsum : Return cumulative sum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cumprod : Return cumulative product over DataFrame axis.</span> |
| <span class="sd"> Series.min : Return the minimum over Series axis.</span> |
| <span class="sd"> Series.cummax : Return cumulative maximum over Series axis.</span> |
| <span class="sd"> Series.cummin : Return cumulative minimum over Series axis.</span> |
| <span class="sd"> Series.cumsum : Return cumulative sum over Series axis.</span> |
| <span class="sd"> Series.cumprod : Return cumulative product over Series axis.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 1.0</span> |
| <span class="sd"> 1 3.0 NaN</span> |
| <span class="sd"> 2 1.0 0.0</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the minimum in each column.</span> |
| |
| <span class="sd"> >>> df.cummin()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 1.0</span> |
| <span class="sd"> 1 2.0 NaN</span> |
| <span class="sd"> 2 1.0 0.0</span> |
| |
| <span class="sd"> It works identically in Series.</span> |
| |
| <span class="sd"> >>> df.A.cummin()</span> |
| <span class="sd"> 0 2.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 1.0</span> |
| <span class="sd"> Name: A, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: add 'axis' parameter</span> |
| <span class="k">def</span> <span class="nf">cummax</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return cumulative maximum over a DataFrame or Series axis.</span> |
| |
| <span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative maximum.</span> |
| |
| <span class="sd"> .. note:: the current implementation of cummax uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This leads to moving all data into</span> |
| <span class="sd"> a single partition in a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method with very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : boolean, default True</span> |
| <span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.max : Return the maximum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cummax : Return cumulative maximum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cummin : Return cumulative minimum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cumsum : Return cumulative sum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cumprod : Return cumulative product over DataFrame axis.</span> |
| <span class="sd"> Series.max : Return the maximum over Series axis.</span> |
| <span class="sd"> Series.cummax : Return cumulative maximum over Series axis.</span> |
| <span class="sd"> Series.cummin : Return cumulative minimum over Series axis.</span> |
| <span class="sd"> Series.cumsum : Return cumulative sum over Series axis.</span> |
| <span class="sd"> Series.cumprod : Return cumulative product over Series axis.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 1.0</span> |
| <span class="sd"> 1 3.0 NaN</span> |
| <span class="sd"> 2 1.0 0.0</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the maximum in each column.</span> |
| |
| <span class="sd"> >>> df.cummax()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 1.0</span> |
| <span class="sd"> 1 3.0 NaN</span> |
| <span class="sd"> 2 3.0 1.0</span> |
| |
| <span class="sd"> It works identically in Series.</span> |
| |
| <span class="sd"> >>> df.B.cummax()</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 NaN</span> |
| <span class="sd"> 2 1.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: add 'axis' parameter</span> |
| <span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return cumulative sum over a DataFrame or Series axis.</span> |
| |
| <span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative sum.</span> |
| |
| <span class="sd"> .. note:: the current implementation of cumsum uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This leads to moving all data into</span> |
| <span class="sd"> a single partition in a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method with very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : boolean, default True</span> |
| <span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.sum : Return the sum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cummax : Return cumulative maximum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cummin : Return cumulative minimum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cumsum : Return cumulative sum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cumprod : Return cumulative product over DataFrame axis.</span> |
| <span class="sd"> Series.sum : Return the sum over Series axis.</span> |
| <span class="sd"> Series.cummax : Return cumulative maximum over Series axis.</span> |
| <span class="sd"> Series.cummin : Return cumulative minimum over Series axis.</span> |
| <span class="sd"> Series.cumsum : Return cumulative sum over Series axis.</span> |
| <span class="sd"> Series.cumprod : Return cumulative product over Series axis.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list('AB'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 1.0</span> |
| <span class="sd"> 1 3.0 NaN</span> |
| <span class="sd"> 2 1.0 0.0</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the sum in each column.</span> |
| |
| <span class="sd"> >>> df.cumsum()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 1.0</span> |
| <span class="sd"> 1 5.0 NaN</span> |
| <span class="sd"> 2 6.0 1.0</span> |
| |
| <span class="sd"> It works identically in Series.</span> |
| |
| <span class="sd"> >>> df.A.cumsum()</span> |
| <span class="sd"> 0 2.0</span> |
| <span class="sd"> 1 5.0</span> |
| <span class="sd"> 2 6.0</span> |
| <span class="sd"> Name: A, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cumsum</span><span class="p">(</span><span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: add 'axis' parameter</span> |
| <span class="c1"># TODO: use pandas_udf to support negative values and other options later</span> |
| <span class="c1"># other window except unbounded ones is supported as of Spark 3.0.</span> |
| <span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return cumulative product over a DataFrame or Series axis.</span> |
| |
| <span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative product.</span> |
| |
| <span class="sd"> .. note:: the current implementation of cumprod uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This leads to moving all data into</span> |
| <span class="sd"> a single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method with very large datasets.</span> |
| |
| <span class="sd"> .. note:: unlike pandas, pandas-on-Spark emulates the cumulative product with the</span> |
| <span class="sd"> ``exp(sum(log(...)))`` trick. Therefore, it only works for positive numbers.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : boolean, default True</span> |
| <span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.cummax : Return cumulative maximum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cummin : Return cumulative minimum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cumsum : Return cumulative sum over DataFrame axis.</span> |
| <span class="sd"> DataFrame.cumprod : Return cumulative product over DataFrame axis.</span> |
| <span class="sd"> Series.cummax : Return cumulative maximum over Series axis.</span> |
| <span class="sd"> Series.cummin : Return cumulative minimum over Series axis.</span> |
| <span class="sd"> Series.cumsum : Return cumulative sum over Series axis.</span> |
| <span class="sd"> Series.cumprod : Return cumulative product over Series axis.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> Exception : If any value is equal to or lower than 0.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[2.0, 1.0], [3.0, None], [4.0, 10.0]], columns=list('AB'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 1.0</span> |
| <span class="sd"> 1 3.0 NaN</span> |
| <span class="sd"> 2 4.0 10.0</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the product in each column.</span> |
| |
| <span class="sd"> >>> df.cumprod()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 1.0</span> |
| <span class="sd"> 1 6.0 NaN</span> |
| <span class="sd"> 2 24.0 10.0</span> |
| |
| <span class="sd"> It works identically in Series.</span> |
| |
| <span class="sd"> >>> df.A.cumprod()</span> |
| <span class="sd"> 0 2.0</span> |
| <span class="sd"> 1 6.0</span> |
| <span class="sd"> 2 24.0</span> |
| <span class="sd"> Name: A, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cumprod</span><span class="p">(</span><span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: Although this has been removed in pandas >= 1.0.0, we're keeping this as deprecated</span> |
| <span class="c1"># since we're using this for `DataFrame.info` internally.</span> |
| <span class="c1"># We can drop it once our minimal pandas version becomes 1.0.0.</span> |
| <span class="k">def</span> <span class="nf">get_dtype_counts</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return counts of unique dtypes in this object.</span> |
| |
| <span class="sd"> .. deprecated:: 0.14.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> dtype : pd.Series</span> |
| <span class="sd"> Series with the count of columns with each dtype.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> dtypes : Return the dtypes in this object.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> a = [['a', 1, 1], ['b', 2, 2], ['c', 3, 3]]</span> |
| <span class="sd"> >>> df = ps.DataFrame(a, columns=['str', 'int1', 'int2'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> str int1 int2</span> |
| <span class="sd"> 0 a 1 1</span> |
| <span class="sd"> 1 b 2 2</span> |
| <span class="sd"> 2 c 3 3</span> |
| |
| <span class="sd"> >>> df.get_dtype_counts().sort_values()</span> |
| <span class="sd"> object 1</span> |
| <span class="sd"> int64 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> df.str.get_dtype_counts().sort_values()</span> |
| <span class="sd"> object 1</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"`get_dtype_counts` has been deprecated and will be "</span> |
| <span class="s2">"removed in a future version. For DataFrames use "</span> |
| <span class="s2">"`.dtypes.value_counts()"</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtypes</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">):</span> |
| <span class="n">dtypes</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">dtypes</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">dtypes</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtypes</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="n">Counter</span><span class="p">([</span><span class="n">d</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">d</span> <span class="ow">in</span> <span class="n">dtypes</span><span class="p">])))</span> |
| |
| <span class="k">def</span> <span class="nf">pipe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""</span> |
| <span class="sd"> Apply func(self, \*args, \*\*kwargs).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> function to apply to the DataFrame.</span> |
| <span class="sd"> ``args``, and ``kwargs`` are passed into ``func``.</span> |
| <span class="sd"> Alternatively a ``(callable, data_keyword)`` tuple where</span> |
| <span class="sd"> ``data_keyword`` is a string indicating the keyword of</span> |
| <span class="sd"> ``callable`` that expects the DataFrames.</span> |
| <span class="sd"> args : iterable, optional</span> |
| <span class="sd"> positional arguments passed into ``func``.</span> |
| <span class="sd"> kwargs : mapping, optional</span> |
| <span class="sd"> a dictionary of keyword arguments passed into ``func``.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> object : the return type of ``func``.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Use ``.pipe`` when chaining together functions that expect</span> |
| <span class="sd"> Series, DataFrames or GroupBy objects. For example, given</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'category': ['A', 'A', 'B'],</span> |
| <span class="sd"> ... 'col1': [1, 2, 3],</span> |
| <span class="sd"> ... 'col2': [4, 5, 6]},</span> |
| <span class="sd"> ... columns=['category', 'col1', 'col2'])</span> |
| <span class="sd"> >>> def keep_category_a(df):</span> |
| <span class="sd"> ... return df[df['category'] == 'A']</span> |
| <span class="sd"> >>> def add_one(df, column):</span> |
| <span class="sd"> ... return df.assign(col3=df[column] + 1)</span> |
| <span class="sd"> >>> def multiply(df, column1, column2):</span> |
| <span class="sd"> ... return df.assign(col4=df[column1] * df[column2])</span> |
| |
| |
| <span class="sd"> instead of writing</span> |
| |
| <span class="sd"> >>> multiply(add_one(keep_category_a(df), column="col1"), column1="col2", column2="col3")</span> |
| <span class="sd"> category col1 col2 col3 col4</span> |
| <span class="sd"> 0 A 1 4 2 8</span> |
| <span class="sd"> 1 A 2 5 3 15</span> |
| |
| |
| <span class="sd"> You can write</span> |
| |
| <span class="sd"> >>> (df.pipe(keep_category_a)</span> |
| <span class="sd"> ... .pipe(add_one, column="col1")</span> |
| <span class="sd"> ... .pipe(multiply, column1="col2", column2="col3")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> category col1 col2 col3 col4</span> |
| <span class="sd"> 0 A 1 4 2 8</span> |
| <span class="sd"> 1 A 2 5 3 15</span> |
| |
| |
| <span class="sd"> If you have a function that takes the data as (say) the second</span> |
| <span class="sd"> argument, pass a tuple indicating which keyword expects the</span> |
| <span class="sd"> data. For example, suppose ``f`` takes its data as ``df``:</span> |
| |
| <span class="sd"> >>> def multiply_2(column1, df, column2):</span> |
| <span class="sd"> ... return df.assign(col4=df[column1] * df[column2])</span> |
| |
| |
| <span class="sd"> Then you can write</span> |
| |
| <span class="sd"> >>> (df.pipe(keep_category_a)</span> |
| <span class="sd"> ... .pipe(add_one, column="col1")</span> |
| <span class="sd"> ... .pipe((multiply_2, 'df'), column1="col2", column2="col3")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> category col1 col2 col3 col4</span> |
| <span class="sd"> 0 A 1 4 2 8</span> |
| <span class="sd"> 1 A 2 5 3 15</span> |
| |
| <span class="sd"> You can use lambda as well</span> |
| |
| <span class="sd"> >>> ps.Series([1, 2, 3]).pipe(lambda x: (x + 1).rename("value"))</span> |
| <span class="sd"> 0 2</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 4</span> |
| <span class="sd"> Name: value, dtype: int64</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">func</span><span class="p">,</span> <span class="n">target</span> <span class="o">=</span> <span class="n">func</span> |
| <span class="k">if</span> <span class="n">target</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2"> is both the pipe target and a keyword "</span> <span class="s2">"argument"</span> <span class="o">%</span> <span class="n">target</span><span class="p">)</span> |
| <span class="n">kwargs</span><span class="p">[</span><span class="n">target</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">to_numpy</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> A NumPy ndarray representing the values in this DataFrame or Series.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting NumPy ndarray is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> numpy.ndarray</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()</span> |
| <span class="sd"> array([[1, 3],</span> |
| <span class="sd"> [2, 4]])</span> |
| |
| <span class="sd"> With heterogeneous data, the lowest common type will have to be used.</span> |
| |
| <span class="sd"> >>> ps.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}).to_numpy()</span> |
| <span class="sd"> array([[1. , 3. ],</span> |
| <span class="sd"> [2. , 4.5]])</span> |
| |
| <span class="sd"> For a mix of numeric and non-numeric types, the output array will have object dtype.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2], "B": [3.0, 4.5], "C": pd.date_range('2000', periods=2)})</span> |
| <span class="sd"> >>> df.to_numpy()</span> |
| <span class="sd"> array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],</span> |
| <span class="sd"> [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)</span> |
| |
| <span class="sd"> For Series,</span> |
| |
| <span class="sd"> >>> ps.Series(['a', 'b', 'a']).to_numpy()</span> |
| <span class="sd"> array(['a', 'b', 'a'], dtype=object)</span> |
| <span class="sd"> """</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"`to_numpy` loads all data into the driver's memory. "</span> |
| <span class="s2">"It should only be used if the resulting NumPy ndarray is expected to be small."</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">values</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">values</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return a Numpy representation of the DataFrame or the Series.</span> |
| |
| <span class="sd"> .. warning:: We recommend using `DataFrame.to_numpy()` or `Series.to_numpy()` instead.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting NumPy ndarray is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> numpy.ndarray</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> A DataFrame where all columns are the same type (e.g., int64) results in an array of</span> |
| <span class="sd"> the same type.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'age': [ 3, 29],</span> |
| <span class="sd"> ... 'height': [94, 170],</span> |
| <span class="sd"> ... 'weight': [31, 115]})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> age height weight</span> |
| <span class="sd"> 0 3 94 31</span> |
| <span class="sd"> 1 29 170 115</span> |
| <span class="sd"> >>> df.dtypes</span> |
| <span class="sd"> age int64</span> |
| <span class="sd"> height int64</span> |
| <span class="sd"> weight int64</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> >>> df.values</span> |
| <span class="sd"> array([[ 3, 94, 31],</span> |
| <span class="sd"> [ 29, 170, 115]])</span> |
| |
| <span class="sd"> A DataFrame with mixed type columns (e.g., str/object, int64, float32) results in an ndarray</span> |
| <span class="sd"> of the broadest type that accommodates these mixed types (e.g., object).</span> |
| |
| <span class="sd"> >>> df2 = ps.DataFrame([('parrot', 24.0, 'second'),</span> |
| <span class="sd"> ... ('lion', 80.5, 'first'),</span> |
| <span class="sd"> ... ('monkey', np.nan, None)],</span> |
| <span class="sd"> ... columns=('name', 'max_speed', 'rank'))</span> |
| <span class="sd"> >>> df2.dtypes</span> |
| <span class="sd"> name object</span> |
| <span class="sd"> max_speed float64</span> |
| <span class="sd"> rank object</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> >>> df2.values</span> |
| <span class="sd"> array([['parrot', 24.0, 'second'],</span> |
| <span class="sd"> ['lion', 80.5, 'first'],</span> |
| <span class="sd"> ['monkey', nan, None]], dtype=object)</span> |
| |
| <span class="sd"> For Series,</span> |
| |
| <span class="sd"> >>> ps.Series([1, 2, 3]).values</span> |
| <span class="sd"> array([1, 2, 3])</span> |
| |
| <span class="sd"> >>> ps.Series(list('aabc')).values</span> |
| <span class="sd"> array(['a', 'a', 'b', 'c'], dtype=object)</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"We recommend using `</span><span class="si">{}</span><span class="s2">.to_numpy()` instead."</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_numpy</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">to_csv</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">","</span><span class="p">,</span> |
| <span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">""</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">quotechar</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">'"'</span><span class="p">,</span> |
| <span class="n">date_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">escapechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">num_files</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"w"</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""</span> |
| <span class="sd"> Write object to a comma-separated values (csv) file.</span> |
| |
| <span class="sd"> .. note:: pandas-on-Spark `to_csv` writes files to a path or URI. Unlike pandas,</span> |
| <span class="sd"> pandas-on-Spark respects HDFS properties such as 'fs.default.name'.</span> |
| |
| <span class="sd"> .. note:: pandas-on-Spark writes CSV files into the directory, `path`, and writes</span> |
| <span class="sd"> multiple `part-...` files in the directory when `path` is specified.</span> |
| <span class="sd"> This behaviour was inherited from Apache Spark. The number of files can</span> |
| <span class="sd"> be controlled by `num_files`.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : str, default None</span> |
| <span class="sd"> File path. If None is provided the result is returned as a string.</span> |
| <span class="sd"> sep : str, default ','</span> |
| <span class="sd"> String of length 1. Field delimiter for the output file.</span> |
| <span class="sd"> na_rep : str, default ''</span> |
| <span class="sd"> Missing data representation.</span> |
| <span class="sd"> columns : sequence, optional</span> |
| <span class="sd"> Columns to write.</span> |
| <span class="sd"> header : bool or list of str, default True</span> |
| <span class="sd"> Write out the column names. If a list of strings is given it is</span> |
| <span class="sd"> assumed to be aliases for the column names.</span> |
| <span class="sd"> quotechar : str, default '\"'</span> |
| <span class="sd"> String of length 1. Character used to quote fields.</span> |
| <span class="sd"> date_format : str, default None</span> |
| <span class="sd"> Format string for datetime objects.</span> |
| <span class="sd"> escapechar : str, default None</span> |
| <span class="sd"> String of length 1. Character used to escape `sep` and `quotechar`</span> |
| <span class="sd"> when appropriate.</span> |
| <span class="sd"> num_files : the number of files to be written in `path` directory when</span> |
| <span class="sd"> this is a path.</span> |
| <span class="sd"> mode : str</span> |
| <span class="sd"> Python write mode, default 'w'.</span> |
| |
| <span class="sd"> .. note:: mode can accept the strings for Spark writing mode.</span> |
| <span class="sd"> Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.</span> |
| |
| <span class="sd"> - 'append' (equivalent to 'a'): Append the new data to existing data.</span> |
| <span class="sd"> - 'overwrite' (equivalent to 'w'): Overwrite existing data.</span> |
| <span class="sd"> - 'ignore': Silently ignore this operation if data already exists.</span> |
| <span class="sd"> - 'error' or 'errorifexists': Throw an exception if data already exists.</span> |
| |
| <span class="sd"> partition_cols : str or list of str, optional, default None</span> |
| <span class="sd"> Names of partitioning columns</span> |
| <span class="sd"> index_col: str or list of str, optional, default: None</span> |
| <span class="sd"> Column names to be used in Spark to represent pandas-on-Spark's index. The index name</span> |
| <span class="sd"> in pandas-on-Spark is ignored. By default, the index is always lost.</span> |
| <span class="sd"> options: keyword arguments for additional options specific to PySpark.</span> |
| <span class="sd"> These kwargs are specific to PySpark's CSV options to pass. Check</span> |
| <span class="sd"> the options in PySpark's API documentation for spark.write.csv(...).</span> |
| <span class="sd"> It has higher priority and overwrites all other options.</span> |
| <span class="sd"> This parameter only works when `path` is specified.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> str or None</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_csv</span> |
| <span class="sd"> DataFrame.to_delta</span> |
| <span class="sd"> DataFrame.to_table</span> |
| <span class="sd"> DataFrame.to_parquet</span> |
| <span class="sd"> DataFrame.to_spark_io</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(dict(</span> |
| <span class="sd"> ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')),</span> |
| <span class="sd"> ... country=['KR', 'US', 'JP'],</span> |
| <span class="sd"> ... code=[1, 2 ,3]), columns=['date', 'country', 'code'])</span> |
| <span class="sd"> >>> df.sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> date country code</span> |
| <span class="sd"> ... 2012-01-31 12:00:00 KR 1</span> |
| <span class="sd"> ... 2012-02-29 12:00:00 US 2</span> |
| <span class="sd"> ... 2012-03-31 12:00:00 JP 3</span> |
| |
| <span class="sd"> >>> print(df.to_csv()) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> date,country,code</span> |
| <span class="sd"> 2012-01-31 12:00:00,KR,1</span> |
| <span class="sd"> 2012-02-29 12:00:00,US,2</span> |
| <span class="sd"> 2012-03-31 12:00:00,JP,3</span> |
| |
| <span class="sd"> >>> df.cummax().to_csv(path=r'%s/to_csv/foo.csv' % path, num_files=1)</span> |
| <span class="sd"> >>> ps.read_csv(</span> |
| <span class="sd"> ... path=r'%s/to_csv/foo.csv' % path</span> |
| <span class="sd"> ... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> date country code</span> |
| <span class="sd"> ... 2012-01-31 12:00:00 KR 1</span> |
| <span class="sd"> ... 2012-02-29 12:00:00 US 2</span> |
| <span class="sd"> ... 2012-03-31 12:00:00 US 3</span> |
| |
| <span class="sd"> In case of Series,</span> |
| |
| <span class="sd"> >>> print(df.date.to_csv()) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> date</span> |
| <span class="sd"> 2012-01-31 12:00:00</span> |
| <span class="sd"> 2012-02-29 12:00:00</span> |
| <span class="sd"> 2012-03-31 12:00:00</span> |
| |
| <span class="sd"> >>> df.date.to_csv(path=r'%s/to_csv/foo.csv' % path, num_files=1)</span> |
| <span class="sd"> >>> ps.read_csv(</span> |
| <span class="sd"> ... path=r'%s/to_csv/foo.csv' % path</span> |
| <span class="sd"> ... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> date</span> |
| <span class="sd"> ... 2012-01-31 12:00:00</span> |
| <span class="sd"> ... 2012-02-29 12:00:00</span> |
| <span class="sd"> ... 2012-03-31 12:00:00</span> |
| |
| <span class="sd"> You can preserve the index in the roundtrip as below.</span> |
| |
| <span class="sd"> >>> df.set_index("country", append=True, inplace=True)</span> |
| <span class="sd"> >>> df.date.to_csv(</span> |
| <span class="sd"> ... path=r'%s/to_csv/bar.csv' % path,</span> |
| <span class="sd"> ... num_files=1,</span> |
| <span class="sd"> ... index_col=["index1", "index2"])</span> |
| <span class="sd"> >>> ps.read_csv(</span> |
| <span class="sd"> ... path=r'%s/to_csv/bar.csv' % path, index_col=["index1", "index2"]</span> |
| <span class="sd"> ... ).sort_values(by="date") # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> date</span> |
| <span class="sd"> index1 index2</span> |
| <span class="sd"> ... ... 2012-01-31 12:00:00</span> |
| <span class="sd"> ... ... 2012-02-29 12:00:00</span> |
| <span class="sd"> ... ... 2012-03-31 12:00:00</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="c1"># If path is none, just collect and use pandas's to_csv.</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span> |
| <span class="kc">None</span><span class="p">,</span> |
| <span class="n">sep</span><span class="o">=</span><span class="n">sep</span><span class="p">,</span> |
| <span class="n">na_rep</span><span class="o">=</span><span class="n">na_rep</span><span class="p">,</span> |
| <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> |
| <span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span> |
| <span class="n">quotechar</span><span class="o">=</span><span class="n">quotechar</span><span class="p">,</span> |
| <span class="n">date_format</span><span class="o">=</span><span class="n">date_format</span><span class="p">,</span> |
| <span class="n">escapechar</span><span class="o">=</span><span class="n">escapechar</span><span class="p">,</span> |
| <span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">):</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">col</span><span class="p">,))</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">index_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">index_cols</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_cols</span> <span class="o">=</span> <span class="n">index_col</span> |
| |
| <span class="k">if</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"to_csv only support one-level index column now"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">header</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span> |
| <span class="n">new_name</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">new_name</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">header</span><span class="p">))</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| <span class="n">header</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">num_files</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"`num_files` has been deprecated and might be removed in a future version. "</span> |
| <span class="s2">"Use `DataFrame.spark.repartition` instead."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_files</span><span class="p">)</span> |
| |
| <span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="n">builder</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">partition_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">partition_cols</span><span class="p">)</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">_set_opts</span><span class="p">(</span> |
| <span class="n">sep</span><span class="o">=</span><span class="n">sep</span><span class="p">,</span> |
| <span class="n">nullValue</span><span class="o">=</span><span class="n">na_rep</span><span class="p">,</span> |
| <span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span> |
| <span class="n">quote</span><span class="o">=</span><span class="n">quotechar</span><span class="p">,</span> |
| <span class="n">dateFormat</span><span class="o">=</span><span class="n">date_format</span><span class="p">,</span> |
| <span class="n">charToEscapeQuoteEscaping</span><span class="o">=</span><span class="n">escapechar</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"csv"</span><span class="p">)</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| |
| <span class="k">def</span> <span class="nf">to_json</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">compression</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"uncompressed"</span><span class="p">,</span> |
| <span class="n">num_files</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"w"</span><span class="p">,</span> |
| <span class="n">orient</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"records"</span><span class="p">,</span> |
| <span class="n">lines</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert the object to a JSON string.</span> |
| |
| <span class="sd"> .. note:: pandas-on-Spark `to_json` writes files to a path or URI. Unlike pandas',</span> |
| <span class="sd"> pandas-on-Spark respects HDFS's property such as 'fs.default.name'.</span> |
| |
| <span class="sd"> .. note:: pandas-on-Spark writes JSON files into the directory, `path`, and writes</span> |
| <span class="sd"> multiple `part-...` files in the directory when `path` is specified.</span> |
| <span class="sd"> This behaviour was inherited from Apache Spark. The number of files can</span> |
| <span class="sd"> be controlled by `num_files`.</span> |
| |
| <span class="sd"> .. note:: output JSON format is different from pandas'. It always uses `orient='records'`</span> |
| <span class="sd"> for its output. This behaviour might have to change in the near future.</span> |
| |
| <span class="sd"> .. note:: Set `ignoreNullFields` keyword argument to `True` to omit `None` or `NaN` values</span> |
| <span class="sd"> when writing JSON objects. It works only when `path` is provided.</span> |
| |
| <span class="sd"> Note that NaNs and None will be converted to null and datetime objects</span> |
| <span class="sd"> will be converted to UNIX timestamps.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : string, optional</span> |
| <span class="sd"> File path. If not specified, the result is returned as</span> |
| <span class="sd"> a string.</span> |
| <span class="sd"> lines : bool, default True</span> |
| <span class="sd"> If ‘orient’ is ‘records’ write out line delimited json format.</span> |
| <span class="sd"> Will throw ValueError if incorrect ‘orient’ since others are not</span> |
| <span class="sd"> list like. It should be always True for now.</span> |
| <span class="sd"> orient : str, default 'records'</span> |
| <span class="sd"> It should be always 'records' for now.</span> |
| <span class="sd"> compression : {'gzip', 'bz2', 'xz', None}</span> |
| <span class="sd"> A string representing the compression to use in the output file,</span> |
| <span class="sd"> only used when the first argument is a filename. The default,</span> |
| <span class="sd"> 'uncompressed', is passed as-is to the Spark writer's `compression` option.</span> |
| <span class="sd"> num_files : the number of files to be written in the `path` directory when</span> |
| <span class="sd"> `path` is specified. Deprecated; use `DataFrame.spark.repartition` instead.</span> |
| <span class="sd"> mode : str</span> |
| <span class="sd"> Python write mode, default 'w'.</span> |
| |
| <span class="sd"> .. note:: mode can accept the strings for Spark writing mode.</span> |
| <span class="sd"> Such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.</span> |
| |
| <span class="sd"> - 'append' (equivalent to 'a'): Append the new data to existing data.</span> |
| <span class="sd"> - 'overwrite' (equivalent to 'w'): Overwrite existing data.</span> |
| <span class="sd"> - 'ignore': Silently ignore this operation if data already exists.</span> |
| <span class="sd"> - 'error' or 'errorifexists': Throw an exception if data already exists.</span> |
| |
| <span class="sd"> partition_cols : str or list of str, optional, default None</span> |
| <span class="sd"> Names of partitioning columns</span> |
| <span class="sd"> index_col: str or list of str, optional, default: None</span> |
| <span class="sd"> Column names to be used in Spark to represent pandas-on-Spark's index. The index name</span> |
| <span class="sd"> in pandas-on-Spark is ignored. By default, the index is always lost.</span> |
| <span class="sd"> options: keyword arguments for additional options specific to PySpark.</span> |
| <span class="sd"> It is specific to PySpark's JSON options to pass. Check</span> |
| <span class="sd"> the options in PySpark's API documentation for `spark.write.json(...)`.</span> |
| <span class="sd"> It has a higher priority and overwrites all other options.</span> |
| <span class="sd"> This parameter only works when `path` is specified.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> str or None</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([['a', 'b'], ['c', 'd']],</span> |
| <span class="sd"> ... columns=['col 1', 'col 2'])</span> |
| <span class="sd"> >>> df.to_json()</span> |
| <span class="sd"> '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'</span> |
| |
| <span class="sd"> >>> df['col 1'].to_json()</span> |
| <span class="sd"> '[{"col 1":"a"},{"col 1":"c"}]'</span> |
| |
| <span class="sd"> >>> df.to_json(path=r'%s/to_json/foo.json' % path, num_files=1)</span> |
| <span class="sd"> >>> ps.read_json(</span> |
| <span class="sd"> ... path=r'%s/to_json/foo.json' % path</span> |
| <span class="sd"> ... ).sort_values(by="col 1")</span> |
| <span class="sd"> col 1 col 2</span> |
| <span class="sd"> 0 a b</span> |
| <span class="sd"> 1 c d</span> |
| |
| <span class="sd"> >>> df['col 1'].to_json(path=r'%s/to_json/foo.json' % path, num_files=1, index_col="index")</span> |
| <span class="sd"> >>> ps.read_json(</span> |
| <span class="sd"> ... path=r'%s/to_json/foo.json' % path, index_col="index"</span> |
| <span class="sd"> ... ).sort_values(by="col 1") # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> col 1</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 c</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> |
| |
| <span class="n">default_options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span><span class="s2">"ignoreNullFields"</span><span class="p">:</span> <span class="kc">False</span><span class="p">}</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">default_options</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">}</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">lines</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"lines=False is not implemented yet."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">orient</span> <span class="o">!=</span> <span class="s2">"records"</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"orient='records' is supported only for now."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="c1"># If path is none, just collect and use pandas's to_json.</span> |
| <span class="n">psdf_or_ser</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf_or_ser</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="c1"># To make the format consistent and readable by `read_json`, convert it to pandas' and</span> |
| <span class="c1"># use 'records' orient for now.</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"records"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">num_files</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"`num_files` has been deprecated and might be removed in a future version. "</span> |
| <span class="s2">"Use `DataFrame.spark.repartition` instead."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_files</span><span class="p">)</span> |
| |
| <span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="n">builder</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">partition_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">partition_cols</span><span class="p">)</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">_set_opts</span><span class="p">(</span><span class="n">compression</span><span class="o">=</span><span class="n">compression</span><span class="p">)</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"json"</span><span class="p">)</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| |
| <span class="k">def</span> <span class="nf">to_excel</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">excel_writer</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">],</span> |
| <span class="n">sheet_name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"Sheet1"</span><span class="p">,</span> |
| <span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">""</span><span class="p">,</span> |
| <span class="n">float_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">index_label</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">startrow</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">startcol</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">engine</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">merge_cells</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inf_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"inf"</span><span class="p">,</span> |
| <span class="n">verbose</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">freeze_panes</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Write object to an Excel sheet.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting DataFrame is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> To write a single object to an Excel .xlsx file it is only necessary to</span> |
| <span class="sd"> specify a target file name. To write to multiple sheets it is necessary to</span> |
| <span class="sd"> create an `ExcelWriter` object with a target file name, and specify a sheet</span> |
| <span class="sd"> in the file to write to.</span> |
| |
| <span class="sd"> Multiple sheets may be written to by specifying unique `sheet_name`.</span> |
| <span class="sd"> With all data written to the file it is necessary to save the changes.</span> |
| <span class="sd"> Note that creating an `ExcelWriter` object with a file name that already</span> |
| <span class="sd"> exists will result in the contents of the existing file being erased.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> excel_writer : str or ExcelWriter object</span> |
| <span class="sd"> File path or existing ExcelWriter.</span> |
| <span class="sd"> sheet_name : str, default 'Sheet1'</span> |
| <span class="sd"> Name of sheet which will contain DataFrame.</span> |
| <span class="sd"> na_rep : str, default ''</span> |
| <span class="sd"> Missing data representation.</span> |
| <span class="sd"> float_format : str, optional</span> |
| <span class="sd"> Format string for floating point numbers. For example</span> |
| <span class="sd"> ``float_format="%.2f"`` will format 0.1234 to 0.12.</span> |
| <span class="sd"> columns : sequence or list of str, optional</span> |
| <span class="sd"> Columns to write.</span> |
| <span class="sd"> header : bool or list of str, default True</span> |
| <span class="sd"> Write out the column names. If a list of strings is given it is</span> |
| <span class="sd"> assumed to be aliases for the column names.</span> |
| <span class="sd"> index : bool, default True</span> |
| <span class="sd"> Write row names (index).</span> |
| <span class="sd"> index_label : str or sequence, optional</span> |
| <span class="sd"> Column label for index column(s) if desired. If not specified, and</span> |
| <span class="sd"> `header` and `index` are True, then the index names are used. A</span> |
| <span class="sd"> sequence should be given if the DataFrame uses MultiIndex.</span> |
| <span class="sd"> startrow : int, default 0</span> |
| <span class="sd"> Upper left cell row to dump data frame.</span> |
| <span class="sd"> startcol : int, default 0</span> |
| <span class="sd"> Upper left cell column to dump data frame.</span> |
| <span class="sd"> engine : str, optional</span> |
| <span class="sd"> Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this</span> |
| <span class="sd"> via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and</span> |
| <span class="sd"> ``io.excel.xlsm.writer``.</span> |
| <span class="sd"> merge_cells : bool, default True</span> |
| <span class="sd"> Write MultiIndex and Hierarchical Rows as merged cells.</span> |
| <span class="sd"> encoding : str, optional</span> |
| <span class="sd"> Encoding of the resulting excel file. Only necessary for xlwt,</span> |
| <span class="sd"> other writers support unicode natively.</span> |
| <span class="sd"> inf_rep : str, default 'inf'</span> |
| <span class="sd"> Representation for infinity (there is no native representation for</span> |
| <span class="sd"> infinity in Excel).</span> |
| <span class="sd"> verbose : bool, default True</span> |
| <span class="sd"> Display more information in the error logs.</span> |
| <span class="sd"> freeze_panes : tuple of int (length 2), optional</span> |
| <span class="sd"> Specifies the one-based bottommost row and rightmost column that</span> |
| <span class="sd"> is to be frozen.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Once a workbook has been saved it is not possible to write further data</span> |
| <span class="sd"> without rewriting the whole workbook.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_excel : Read Excel file.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Create, write to and save a workbook:</span> |
| |
| <span class="sd"> >>> df1 = ps.DataFrame([['a', 'b'], ['c', 'd']],</span> |
| <span class="sd"> ... index=['row 1', 'row 2'],</span> |
| <span class="sd"> ... columns=['col 1', 'col 2'])</span> |
| <span class="sd"> >>> df1.to_excel("output.xlsx") # doctest: +SKIP</span> |
| |
| <span class="sd"> To specify the sheet name:</span> |
| |
| <span class="sd"> >>> df1.to_excel("output.xlsx") # doctest: +SKIP</span> |
| <span class="sd"> >>> df1.to_excel("output.xlsx",</span> |
| <span class="sd"> ... sheet_name='Sheet_name_1') # doctest: +SKIP</span> |
| |
| <span class="sd"> If you wish to write to more than one sheet in the workbook, it is</span> |
| <span class="sd"> necessary to specify an ExcelWriter object:</span> |
| |
| <span class="sd"> >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP</span> |
| <span class="sd"> ... df1.to_excel(writer, sheet_name='Sheet_name_1')</span> |
| <span class="sd"> ... df2.to_excel(writer, sheet_name='Sheet_name_2')</span> |
| |
| <span class="sd"> To set the library that is used to write the Excel file,</span> |
| <span class="sd"> you can pass the `engine` keyword (the default engine is</span> |
| <span class="sd"> automatically chosen depending on the file extension):</span> |
| |
| <span class="sd"> >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP</span> |
| <span class="sd"> """</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"`to_excel` loads all data into the driver's memory. "</span> |
| <span class="s2">"It should only be used if the resulting DataFrame is expected to be small."</span> |
| <span class="p">)</span> |
| <span class="c1"># Make sure locals() call is at the top of the function so we don't capture local variables.</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_excel</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="o">.</span><span class="n">to_excel</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Constructor expects DataFrame or Series; however, "</span> <span class="s2">"got [</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="p">,)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_excel</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the mean of the values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> mean : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.mean()</span> |
| <span class="sd"> a 2.0</span> |
| <span class="sd"> b 0.2</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.mean(axis=1)</span> |
| <span class="sd"> 0 0.55</span> |
| <span class="sd"> 1 1.10</span> |
| <span class="sd"> 2 1.65</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].mean()</span> |
| <span class="sd"> 2.0</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">mean</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"mean"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">sum</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the sum of the values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| <span class="sd"> min_count : int, default 0</span> |
| <span class="sd"> The required number of valid values to perform the operation. If fewer than</span> |
| <span class="sd"> ``min_count`` non-NA values are present the result will be NA.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> sum : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, np.nan, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.sum()</span> |
| <span class="sd"> a 6.0</span> |
| <span class="sd"> b 0.4</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.sum(axis=1)</span> |
| <span class="sd"> 0 1.1</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.3</span> |
| <span class="sd"> 3 0.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.sum(min_count=3)</span> |
| <span class="sd"> a 6.0</span> |
| <span class="sd"> b NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.sum(axis=1, min_count=1)</span> |
| <span class="sd"> 0 1.1</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.3</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].sum()</span> |
| <span class="sd"> 6.0</span> |
| |
| <span class="sd"> >>> df['a'].sum(min_count=3)</span> |
| <span class="sd"> 6.0</span> |
| <span class="sd"> >>> df['b'].sum(min_count=3)</span> |
| <span class="sd"> nan</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">spark_column</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="nb">sum</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"sum"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">product</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the product of the values.</span> |
| |
| <span class="sd"> .. note:: unlike pandas, pandas-on-Spark emulates product by the ``exp(sum(log(...)))``</span> |
| <span class="sd"> trick. Therefore, it only works for positive numbers.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| <span class="sd"> min_count : int, default 0</span> |
| <span class="sd"> The required number of valid values to perform the operation. If fewer than</span> |
| <span class="sd"> ``min_count`` non-NA values are present the result will be NA.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> Non-numeric type columns are not included in the result.</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'B': [10, 20, 30, 40, 50],</span> |
| <span class="sd"> ... 'C': ['a', 'b', 'c', 'd', 'e']})</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 10 a</span> |
| <span class="sd"> 1 2 20 b</span> |
| <span class="sd"> 2 3 30 c</span> |
| <span class="sd"> 3 4 40 d</span> |
| <span class="sd"> 4 5 50 e</span> |
| |
| <span class="sd"> >>> psdf.prod()</span> |
| <span class="sd"> A 120</span> |
| <span class="sd"> B 12000000</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> If there are no numeric type columns, returns an empty Series.</span> |
| |
| <span class="sd"> >>> ps.DataFrame({"key": ['a', 'b', 'c'], "val": ['x', 'y', 'z']}).prod()</span> |
| <span class="sd"> Series([], dtype: float64)</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> ps.Series([1, 2, 3, 4, 5]).prod()</span> |
| <span class="sd"> 120</span> |
| |
| <span class="sd"> By default, the product of an empty or all-NA Series is ``1``</span> |
| |
| <span class="sd"> >>> ps.Series([]).prod()</span> |
| <span class="sd"> 1.0</span> |
| |
| <span class="sd"> This can be controlled with the ``min_count`` parameter</span> |
| |
| <span class="sd"> >>> ps.Series([]).prod(min_count=1)</span> |
| <span class="sd"> nan</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="k">def</span> <span class="nf">prod</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">spark_column</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="n">num_zeros</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">spark_column</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> |
| <span class="n">sign</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">spark_column</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">num_zeros</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span> |
| <span class="n">sign</span> <span class="o">*</span> <span class="n">F</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">spark_column</span><span class="p">))))</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">IntegralType</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">round</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">1</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">prod</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"prod"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span> |
| <span class="p">)</span> |
| |
| <span class="n">prod</span> <span class="o">=</span> <span class="n">product</span> |
| |
| <span class="k">def</span> <span class="nf">skew</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return unbiased skew normalized by N-1.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> skew : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.skew() # doctest: +SKIP</span> |
| <span class="sd"> a 0.000000e+00</span> |
| <span class="sd"> b -3.319678e-16</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].skew()</span> |
| <span class="sd"> 0.0</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="k">def</span> <span class="nf">skew</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">count_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="o">~</span><span class="n">spark_column</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span> |
| <span class="c1"># refer to the Pandas implementation 'nanskew'</span> |
| <span class="c1"># https://github.com/pandas-dev/pandas/blob/main/pandas/core/nanops.py#L1152</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">count_scol</span> <span class="o">></span> <span class="mi">2</span><span class="p">,</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">skewness</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| <span class="o">*</span> <span class="n">F</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="mi">1</span> <span class="o">/</span> <span class="n">count_scol</span><span class="p">)</span> |
| <span class="o">*</span> <span class="p">(</span><span class="n">count_scol</span> <span class="o">/</span> <span class="p">(</span><span class="n">count_scol</span> <span class="o">-</span> <span class="mi">2</span><span class="p">)),</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">skew</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"skew"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return excess kurtosis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).</span> |
| <span class="sd"> Computed with Spark's kurtosis aggregate as-is, i.e. without the small-sample bias correction that pandas applies.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> kurt : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.kurtosis()</span> |
| <span class="sd"> a -1.5</span> |
| <span class="sd"> b -1.5</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].kurtosis()</span> |
| <span class="sd"> -1.5</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">kurtosis</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">kurtosis</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"kurtosis"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span> |
| <span class="p">)</span> |
| |
| <span class="n">kurt</span> <span class="o">=</span> <span class="n">kurtosis</span> |
| |
| <span class="k">def</span> <span class="nf">min</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the minimum of the values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span> |
| <span class="sd"> pandas compatibility. False is supported; however, the columns should</span> |
| <span class="sd"> be all numeric or all non-numeric.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> min : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.min()</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> b 0.1</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.min(axis=1)</span> |
| <span class="sd"> 0 0.1</span> |
| <span class="sd"> 1 0.2</span> |
| <span class="sd"> 2 0.3</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].min()</span> |
| <span class="sd"> 1.0</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span> |
| <span class="n">name</span><span class="o">=</span><span class="s2">"min"</span><span class="p">,</span> |
| <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">max</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the maximum of the values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span> |
| <span class="sd"> pandas compatibility. False is supported; however, the columns should</span> |
| <span class="sd"> be all numeric or all non-numeric.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> max : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.max()</span> |
| <span class="sd"> a 3.0</span> |
| <span class="sd"> b 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.max(axis=1)</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].max()</span> |
| <span class="sd"> 3.0</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span> |
| <span class="n">name</span><span class="o">=</span><span class="s2">"max"</span><span class="p">,</span> |
| <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">count</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Count non-NA cells for each column.</span> |
| |
| <span class="sd"> The values `None`, `NaN` are considered NA.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or ‘index’, 1 or ‘columns’}, default 0</span> |
| <span class="sd"> If 0 or ‘index’ counts are generated for each column. If 1 or ‘columns’ counts are</span> |
| <span class="sd"> generated for each row.</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span> |
| <span class="sd"> pandas compatibility.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> count : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.shape: Number of DataFrame rows and columns (including NA</span> |
| <span class="sd"> elements).</span> |
| <span class="sd"> DataFrame.isna: Boolean same-sized DataFrame showing places of NA</span> |
| <span class="sd"> elements.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Constructing DataFrame from a dictionary:</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({"Person":</span> |
| <span class="sd"> ... ["John", "Myla", "Lewis", "John", "Myla"],</span> |
| <span class="sd"> ... "Age": [24., np.nan, 21., 33, 26],</span> |
| <span class="sd"> ... "Single": [False, True, True, True, False]},</span> |
| <span class="sd"> ... columns=["Person", "Age", "Single"])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> Person Age Single</span> |
| <span class="sd"> 0 John 24.0 False</span> |
| <span class="sd"> 1 Myla NaN True</span> |
| <span class="sd"> 2 Lewis 21.0 True</span> |
| <span class="sd"> 3 John 33.0 True</span> |
| <span class="sd"> 4 Myla 26.0 False</span> |
| |
| <span class="sd"> Notice the uncounted NA values:</span> |
| |
| <span class="sd"> >>> df.count()</span> |
| <span class="sd"> Person 5</span> |
| <span class="sd"> Age 4</span> |
| <span class="sd"> Single 5</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> df.count(axis=1)</span> |
| <span class="sd"> 0 3</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['Person'].count()</span> |
| <span class="sd"> 5</span> |
| |
| <span class="sd"> >>> df['Age'].count()</span> |
| <span class="sd"> 4</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">Frame</span><span class="o">.</span><span class="n">_count_expr</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"count"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">std</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return sample standard deviation.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> ddof : int, default 1</span> |
| <span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span> |
| <span class="sd"> where N represents the number of elements. Only 0 and 1 are supported.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> std : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.std()</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> b 0.1</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.std(axis=1)</span> |
| <span class="sd"> 0 0.636396</span> |
| <span class="sd"> 1 1.272792</span> |
| <span class="sd"> 2 1.909188</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.std(ddof=0)</span> |
| <span class="sd"> a 0.816497</span> |
| <span class="sd"> b 0.081650</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].std()</span> |
| <span class="sd"> 1.0</span> |
| |
| <span class="sd"> >>> df['a'].std(ddof=0)</span> |
| <span class="sd"> 0.816496580927726</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_pop</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_samp</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">std</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"std"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">var</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return unbiased variance.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> ddof : int, default 1</span> |
| <span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span> |
| <span class="sd"> where N represents the number of elements.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> var : scalar for a Series, and a Series for a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.var()</span> |
| <span class="sd"> a 1.00</span> |
| <span class="sd"> b 0.01</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.var(axis=1)</span> |
| <span class="sd"> 0 0.405</span> |
| <span class="sd"> 1 1.620</span> |
| <span class="sd"> 2 3.645</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.var(ddof=0)</span> |
| <span class="sd"> a 0.666667</span> |
| <span class="sd"> b 0.006667</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].var()</span> |
| <span class="sd"> 1.0</span> |
| |
| <span class="sd"> >>> df['a'].var(ddof=0)</span> |
| <span class="sd"> 0.6666666666666666</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">var_pop</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">var_samp</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">var</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"var"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">median</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the median of the values for the requested axis.</span> |
| |
| <span class="sd"> .. note:: Unlike pandas, the median in pandas-on-Spark is an approximate median based upon</span> |
| <span class="sd"> approximate percentile computation, because computing the exact median across a large dataset</span> |
| <span class="sd"> is extremely expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| <span class="sd"> accuracy : int, default 10000</span> |
| <span class="sd"> Accuracy of the approximation. A larger value means better accuracy.</span> |
| <span class="sd"> The relative error can be deduced by 1.0 / accuracy.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> median : scalar or Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'a': [24., 21., 25., 33., 26.], 'b': [1, 2, 3, 4, 5]}, columns=['a', 'b'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 24.0 1</span> |
| <span class="sd"> 1 21.0 2</span> |
| <span class="sd"> 2 25.0 3</span> |
| <span class="sd"> 3 33.0 4</span> |
| <span class="sd"> 4 26.0 5</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.median()</span> |
| <span class="sd"> a 25.0</span> |
| <span class="sd"> b 3.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df['a'].median()</span> |
| <span class="sd"> 25.0</span> |
| <span class="sd"> >>> (df['b'] + 100).median()</span> |
| <span class="sd"> 103.0</span> |
| |
| <span class="sd"> For multi-index columns,</span> |
| |
| <span class="sd"> >>> df.columns = pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> x y</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 24.0 1</span> |
| <span class="sd"> 1 21.0 2</span> |
| <span class="sd"> 2 25.0 3</span> |
| <span class="sd"> 3 33.0 4</span> |
| <span class="sd"> 4 26.0 5</span> |
| |
| <span class="sd"> On a DataFrame:</span> |
| |
| <span class="sd"> >>> df.median()</span> |
| <span class="sd"> x a 25.0</span> |
| <span class="sd"> y b 3.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.median(axis=1)</span> |
| <span class="sd"> 0 12.5</span> |
| <span class="sd"> 1 11.5</span> |
| <span class="sd"> 2 14.0</span> |
| <span class="sd"> 3 18.5</span> |
| <span class="sd"> 4 15.5</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> On a Series:</span> |
| |
| <span class="sd"> >>> df[('x', 'a')].median()</span> |
| <span class="sd"> 25.0</span> |
| <span class="sd"> >>> (df[('y', 'b')] + 100).median()</span> |
| <span class="sd"> 103.0</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="p">(</span><span class="n">BooleanType</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)):</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">()),</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">median</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"median"</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">sem</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return unbiased standard error of the mean over requested axis.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| <span class="sd"> ddof : int, default 1</span> |
| <span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span> |
| <span class="sd"> where N represents the number of elements.</span> |
| <span class="sd"> numeric_only : bool, default None</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> scalar (for Series) or Series (for DataFrame)</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 1 2 5</span> |
| <span class="sd"> 2 3 6</span> |
| |
| <span class="sd"> >>> psdf.sem()</span> |
| <span class="sd"> a 0.57735</span> |
| <span class="sd"> b 0.57735</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> psdf.sem(ddof=0)</span> |
| <span class="sd"> a 0.471405</span> |
| <span class="sd"> b 0.471405</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> psdf.sem(axis=1)</span> |
| <span class="sd"> 0 1.5</span> |
| <span class="sd"> 1 1.5</span> |
| <span class="sd"> 2 1.5</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Support for Series</span> |
| |
| <span class="sd"> >>> psser = psdf.a</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> Name: a, dtype: int64</span> |
| |
| <span class="sd"> >>> psser.sem()</span> |
| <span class="sd"> 0.5773502691896258</span> |
| |
| <span class="sd"> >>> psser.sem(ddof=0)</span> |
| <span class="sd"> 0.47140452079103173</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_pop</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_samp</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">sem</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">std</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> <span class="o">/</span> <span class="nb">pow</span><span class="p">(</span><span class="n">Frame</span><span class="o">.</span><span class="n">_count_expr</span><span class="p">(</span><span class="n">psser</span><span class="p">),</span> <span class="mf">0.5</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">sem</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"sem"</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return an int representing the number of elements in this object.</span> |
| |
| <span class="sd"> Return the number of rows if Series. Otherwise return the number of</span> |
| <span class="sd"> rows times number of columns if DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series({'a': 1, 'b': 2, 'c': None})</span> |
| <span class="sd"> >>> s.size</span> |
| <span class="sd"> 3</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'col1': [1, 2, None], 'col2': [3, 4, None]})</span> |
| <span class="sd"> >>> df.size</span> |
| <span class="sd"> 6</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame(index=[1, 2, None])</span> |
| <span class="sd"> >>> df.size</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> """</span> |
| <span class="n">num_columns</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">num_columns</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="mi">0</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">*</span> <span class="n">num_columns</span> <span class="c1"># type: ignore[arg-type]</span> |
| |
| <span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return a Series/DataFrame with absolute numeric value of each element.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> abs : Series/DataFrame containing the absolute value of each element.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> Absolute numeric values in a Series.</span> |
| |
| <span class="sd"> >>> s = ps.Series([-1.10, 2, -3.33, 4])</span> |
| <span class="sd"> >>> s.abs()</span> |
| <span class="sd"> 0 1.10</span> |
| <span class="sd"> 1 2.00</span> |
| <span class="sd"> 2 3.33</span> |
| <span class="sd"> 3 4.00</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Absolute numeric values in a DataFrame.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'a': [4, 5, 6, 7],</span> |
| <span class="sd"> ... 'b': [10, 20, 30, 40],</span> |
| <span class="sd"> ... 'c': [100, 50, -30, -50]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df.abs()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 4 10 100</span> |
| <span class="sd"> 1 5 20 50</span> |
| <span class="sd"> 2 6 30 30</span> |
| <span class="sd"> 3 7 40 50</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Column</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">psser</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span> <span class="n">field</span><span class="o">=</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"bad operand type for abs(): </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">),</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">(),</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="nb">abs</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: by argument only support the grouping name and as_index only for now. Documentation</span> |
| <span class="c1"># should be updated when it's supported.</span> |
| <span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> |
| <span class="n">by</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]]],</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupBy[FrameLike]"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Group DataFrame or Series using one or more columns.</span> |
| |
| <span class="sd"> A groupby operation involves some combination of splitting the</span> |
| <span class="sd"> object, applying a function, and combining the results. This can be</span> |
| <span class="sd"> used to group large amounts of data and compute operations on these</span> |
| <span class="sd"> groups.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> by : Series, label, or list of labels</span> |
| <span class="sd"> Used to determine the groups for the groupby.</span> |
| <span class="sd"> If a Series is passed, the values of the Series</span> |
| <span class="sd"> will be used to determine the groups. A label or list of</span> |
| <span class="sd"> labels may be passed to group by the columns in ``self``.</span> |
| <span class="sd"> axis : int, default 0 or 'index'</span> |
| <span class="sd"> Can only be set to 0 at the moment.</span> |
| <span class="sd"> as_index : bool, default True</span> |
| <span class="sd"> For aggregated output, return object with group labels as the</span> |
| <span class="sd"> index. Only relevant for DataFrame input. as_index=False is</span> |
| <span class="sd"> effectively "SQL-style" grouped output.</span> |
| <span class="sd"> dropna : bool, default True</span> |
| <span class="sd"> If True, and if group keys contain NA values,</span> |
| <span class="sd"> NA values together with row/column will be dropped.</span> |
| <span class="sd"> If False, NA values will also be treated as the key in groups.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrameGroupBy or SeriesGroupBy</span> |
| <span class="sd"> Depends on the calling object and returns groupby object that</span> |
| <span class="sd"> contains information about the groups.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.groupby.GroupBy</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'Animal': ['Falcon', 'Falcon',</span> |
| <span class="sd"> ... 'Parrot', 'Parrot'],</span> |
| <span class="sd"> ... 'Max Speed': [380., 370., 24., 26.]},</span> |
| <span class="sd"> ... columns=['Animal', 'Max Speed'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> Animal Max Speed</span> |
| <span class="sd"> 0 Falcon 380.0</span> |
| <span class="sd"> 1 Falcon 370.0</span> |
| <span class="sd"> 2 Parrot 24.0</span> |
| <span class="sd"> 3 Parrot 26.0</span> |
| |
| <span class="sd"> >>> df.groupby(['Animal']).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> Max Speed</span> |
| <span class="sd"> Animal</span> |
| <span class="sd"> Falcon 375.0</span> |
| <span class="sd"> Parrot 25.0</span> |
| |
| <span class="sd"> >>> df.groupby(['Animal'], as_index=False).mean().sort_values('Animal')</span> |
| <span class="sd"> ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> Animal Max Speed</span> |
| <span class="sd"> ...Falcon 375.0</span> |
| <span class="sd"> ...Parrot 25.0</span> |
| |
| <span class="sd"> We can also choose whether to include NA in group keys by setting the dropna parameter;</span> |
| <span class="sd"> the default setting is True:</span> |
| |
| <span class="sd"> >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]</span> |
| <span class="sd"> >>> df = ps.DataFrame(l, columns=["a", "b", "c"])</span> |
| <span class="sd"> >>> df.groupby(by=["b"]).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a c</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 1.0 2 3</span> |
| <span class="sd"> 2.0 2 5</span> |
| |
| <span class="sd"> >>> df.groupby(by=["b"], dropna=False).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a c</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 1.0 2 3</span> |
| <span class="sd"> 2.0 2 5</span> |
| <span class="sd"> NaN 1 4</span> |
| <span class="sd"> """</span> |
| <span class="n">new_by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Label</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">]]</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Grouper for '</span><span class="si">{}</span><span class="s2">' not 1-dimensional"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">by</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">by</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">by</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">by</span><span class="p">)</span> |
| <span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">by</span><span class="p">)]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">by</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">by</span><span class="p">)</span> |
| <span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">by</span><span class="p">,))]</span> |
| <span class="k">elif</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">by</span><span class="p">):</span> |
| <span class="n">new_by</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">by</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Grouper for '</span><span class="si">{}</span><span class="s2">' not 1-dimensional"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">key</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| <span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">key</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| <span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">key</span><span class="p">,)))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Grouper for '</span><span class="si">{}</span><span class="s2">' not 1-dimensional"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">key</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Grouper for '</span><span class="si">{}</span><span class="s2">' not 1-dimensional"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">by</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">len</span><span class="p">(</span><span class="n">new_by</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"No group keys passed!"</span><span class="p">)</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_build_groupby</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">new_by</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_build_groupby</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupBy[FrameLike]"</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="k">def</span> <span class="nf">bool</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the bool of a single element in the current object.</span> |
| |
| <span class="sd"> This must be a boolean scalar value, either True or False. Raise a ValueError if</span> |
| <span class="sd"> the object does not have exactly 1 element, or that element is not boolean.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.DataFrame({'a': [True]}).bool()</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> >>> ps.Series([False]).bool()</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> If the value is non-boolean or there are multiple values, it raises an exception in all</span> |
| <span class="sd"> cases as below.</span> |
| |
| <span class="sd"> >>> ps.DataFrame({'a': ['a']}).bool()</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: bool cannot act on a non-boolean single element DataFrame</span> |
| |
| <span class="sd"> >>> ps.DataFrame({'a': [True], 'b': [False]}).bool() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(),</span> |
| <span class="sd"> a.item(), a.any() or a.all().</span> |
| |
| <span class="sd"> >>> ps.Series([1]).bool()</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: bool cannot act on a non-boolean single element DataFrame</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_dataframe</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"bool() expects DataFrame or Series; however, "</span> <span class="s2">"got [</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="p">,))</span> |
| <span class="k">return</span> <span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">bool</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">first_valid_index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Retrieves the index of the first valid value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> scalar, tuple, or None</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> Support for DataFrame</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [None, 2, 3, 2],</span> |
| <span class="sd"> ... 'b': [None, 2.0, 3.0, 1.0],</span> |
| <span class="sd"> ... 'c': [None, 200, 400, 200]},</span> |
| <span class="sd"> ... index=['Q', 'W', 'E', 'R'])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> Q NaN NaN NaN</span> |
| <span class="sd"> W 2.0 2.0 200.0</span> |
| <span class="sd"> E 3.0 3.0 400.0</span> |
| <span class="sd"> R 2.0 1.0 200.0</span> |
| |
| <span class="sd"> >>> psdf.first_valid_index()</span> |
| <span class="sd"> 'W'</span> |
| |
| <span class="sd"> Support for MultiIndex columns</span> |
| |
| <span class="sd"> >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> x y z</span> |
| <span class="sd"> Q NaN NaN NaN</span> |
| <span class="sd"> W 2.0 2.0 200.0</span> |
| <span class="sd"> E 3.0 3.0 400.0</span> |
| <span class="sd"> R 2.0 1.0 200.0</span> |
| |
| <span class="sd"> >>> psdf.first_valid_index()</span> |
| <span class="sd"> 'W'</span> |
| |
| <span class="sd"> Support for Series.</span> |
| |
| <span class="sd"> >>> s = ps.Series([None, None, 3, 4, 5], index=[100, 200, 300, 400, 500])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 100 NaN</span> |
| <span class="sd"> 200 NaN</span> |
| <span class="sd"> 300 3.0</span> |
| <span class="sd"> 400 4.0</span> |
| <span class="sd"> 500 5.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.first_valid_index()</span> |
| <span class="sd"> 300</span> |
| |
| <span class="sd"> Support for MultiIndex</span> |
| |
| <span class="sd"> >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],</span> |
| <span class="sd"> ... ['speed', 'weight', 'length']],</span> |
| <span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span> |
| <span class="sd"> >>> s = ps.Series([None, None, None, None, 250, 1.5, 320, 1, 0.3], index=midx)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> lama speed NaN</span> |
| <span class="sd"> weight NaN</span> |
| <span class="sd"> length NaN</span> |
| <span class="sd"> cow speed NaN</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.first_valid_index()</span> |
| <span class="sd"> ('cow', 'weight')</span> |
| <span class="sd"> """</span> |
| <span class="n">data_spark_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&</span> <span class="n">y</span><span class="p">,</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">data_spark_columns</span><span class="p">))</span> |
| |
| <span class="k">with</span> <span class="n">sql_conf</span><span class="p">({</span><span class="n">SPARK_CONF_ARROW_ENABLED</span><span class="p">:</span> <span class="kc">False</span><span class="p">}):</span> |
| <span class="c1"># Disable Arrow to keep row ordering.</span> |
| <span class="n">first_valid_row</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># For Empty Series or DataFrame, returns None.</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| |
| <span class="n">first_valid_row</span> <span class="o">=</span> <span class="n">first_valid_row</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_valid_row</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">last_valid_index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return index for last non-NA/null value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> scalar, tuple, or None</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API only works with PySpark >= 3.0.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> Support for DataFrame</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [1, 2, 3, None],</span> |
| <span class="sd"> ... 'b': [1.0, 2.0, 3.0, None],</span> |
| <span class="sd"> ... 'c': [100, 200, 400, None]},</span> |
| <span class="sd"> ... index=['Q', 'W', 'E', 'R'])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> Q 1.0 1.0 100.0</span> |
| <span class="sd"> W 2.0 2.0 200.0</span> |
| <span class="sd"> E 3.0 3.0 400.0</span> |
| <span class="sd"> R NaN NaN NaN</span> |
| |
| <span class="sd"> >>> psdf.last_valid_index() # doctest: +SKIP</span> |
| <span class="sd"> 'E'</span> |
| |
| <span class="sd"> Support for MultiIndex columns</span> |
| |
| <span class="sd"> >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> x y z</span> |
| <span class="sd"> Q 1.0 1.0 100.0</span> |
| <span class="sd"> W 2.0 2.0 200.0</span> |
| <span class="sd"> E 3.0 3.0 400.0</span> |
| <span class="sd"> R NaN NaN NaN</span> |
| |
| <span class="sd"> >>> psdf.last_valid_index() # doctest: +SKIP</span> |
| <span class="sd"> 'E'</span> |
| |
| <span class="sd"> Support for Series.</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, None, None], index=[100, 200, 300, 400, 500])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 100 1.0</span> |
| <span class="sd"> 200 2.0</span> |
| <span class="sd"> 300 3.0</span> |
| <span class="sd"> 400 NaN</span> |
| <span class="sd"> 500 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.last_valid_index() # doctest: +SKIP</span> |
| <span class="sd"> 300</span> |
| |
| <span class="sd"> Support for MultiIndex</span> |
| |
| <span class="sd"> >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],</span> |
| <span class="sd"> ... ['speed', 'weight', 'length']],</span> |
| <span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span> |
| <span class="sd"> >>> s = ps.Series([250, 1.5, 320, 1, 0.3, None, None, None, None], index=midx)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> lama speed 250.0</span> |
| <span class="sd"> weight 1.5</span> |
| <span class="sd"> length 320.0</span> |
| <span class="sd"> cow speed 1.0</span> |
| <span class="sd"> weight 0.3</span> |
| <span class="sd"> length NaN</span> |
| <span class="sd"> falcon speed NaN</span> |
| <span class="sd"> weight NaN</span> |
| <span class="sd"> length NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.last_valid_index() # doctest: +SKIP</span> |
| <span class="sd"> ('cow', 'weight')</span> |
| <span class="sd"> """</span> |
| <span class="n">data_spark_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&</span> <span class="n">y</span><span class="p">,</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">data_spark_columns</span><span class="p">))</span> |
| |
| <span class="n">last_valid_rows</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># For Empty Series or DataFrame, returns None.</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">last_valid_rows</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| |
| <span class="n">last_valid_row</span> <span class="o">=</span> <span class="n">last_valid_rows</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">last_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">last_valid_row</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">last_valid_row</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: 'center', 'win_type', 'on', 'axis' parameter should be implemented.</span> |
| <span class="k">def</span> <span class="nf">rolling</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">window</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Rolling[FrameLike]"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Provide rolling transformations.</span> |
| |
| <span class="sd"> .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.</span> |
| <span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span> |
| <span class="sd"> in the near future.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> window : int, or offset</span> |
| <span class="sd"> Size of the moving window.</span> |
| <span class="sd"> This is the number of observations used for calculating the statistic.</span> |
| <span class="sd"> Each window will be a fixed size.</span> |
| |
| <span class="sd"> min_periods : int, default None</span> |
| <span class="sd"> Minimum number of observations in window required to have a value</span> |
| <span class="sd"> (otherwise result is NA).</span> |
| <span class="sd"> For a window that is specified by an offset, min_periods will default to 1.</span> |
| <span class="sd"> Otherwise, min_periods will default to the size of the window.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> a Window sub-classed for the particular operation</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Rolling</span> |
| |
| <span class="k">return</span> <span class="n">Rolling</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="o">=</span><span class="n">window</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: 'center' and 'axis' parameter should be implemented.</span> |
| <span class="c1"># 'axis' implementation, refer to https://github.com/databricks/koalas/pull/607</span> |
| <span class="k">def</span> <span class="nf">expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Expanding[FrameLike]"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Provide expanding transformations.</span> |
| |
| <span class="sd"> .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.</span> |
| <span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span> |
| <span class="sd"> in the near future.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> min_periods : int, default 1</span> |
| <span class="sd"> Minimum number of observations in window required to have a value</span> |
| <span class="sd"> (otherwise result is NA).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> a Window sub-classed for the particular operation</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Expanding</span> |
| |
| <span class="k">return</span> <span class="n">Expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">default</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Get item from object for given key (DataFrame column, Panel slice,</span> |
| <span class="sd"> etc.). Returns default value if not found.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> key : object</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> value : same type as items contained in object</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'x':range(3), 'y':['a','b','b'], 'z':['a','b','b']},</span> |
| <span class="sd"> ... columns=['x', 'y', 'z'], index=[10, 20, 20])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> x y z</span> |
| <span class="sd"> 10 0 a a</span> |
| <span class="sd"> 20 1 b b</span> |
| <span class="sd"> 20 2 b b</span> |
| |
| <span class="sd"> >>> df.get('x')</span> |
| <span class="sd"> 10 0</span> |
| <span class="sd"> 20 1</span> |
| <span class="sd"> 20 2</span> |
| <span class="sd"> Name: x, dtype: int64</span> |
| |
| <span class="sd"> >>> df.get(['x', 'y'])</span> |
| <span class="sd"> x y</span> |
| <span class="sd"> 10 0 a</span> |
| <span class="sd"> 20 1 b</span> |
| <span class="sd"> 20 2 b</span> |
| |
| <span class="sd"> >>> df.x.get(10)</span> |
| <span class="sd"> 0</span> |
| |
| <span class="sd"> >>> df.x.get(20)</span> |
| <span class="sd"> 20 1</span> |
| <span class="sd"> 20 2</span> |
| <span class="sd"> Name: x, dtype: int64</span> |
| |
| <span class="sd"> >>> df.x.get(15, -1)</span> |
| <span class="sd"> -1</span> |
| <span class="sd"> """</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> |
| <span class="k">except</span> <span class="p">(</span><span class="ne">KeyError</span><span class="p">,</span> <span class="ne">ValueError</span><span class="p">,</span> <span class="ne">IndexError</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">default</span> |
| |
| <span class="k">def</span> <span class="nf">squeeze</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"DataFrame"</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Squeeze 1 dimensional axis objects into scalars.</span> |
| |
| <span class="sd"> Series or DataFrames with a single element are squeezed to a scalar.</span> |
| <span class="sd"> DataFrames with a single column or a single row are squeezed to a</span> |
| <span class="sd"> Series. Otherwise the object is unchanged.</span> |
| |
| <span class="sd"> This method is most useful when you don't know if your</span> |
| <span class="sd"> object is a Series or DataFrame, but you do know it has just a single</span> |
| <span class="sd"> column. In that case you can safely call `squeeze` to ensure you have a</span> |
| <span class="sd"> Series.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns', None}, default None</span> |
| <span class="sd"> A specific axis to squeeze. By default, all length-1 axes are</span> |
| <span class="sd"> squeezed.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame, Series, or scalar</span> |
| <span class="sd"> The projection after squeezing `axis` or all the axes.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.iloc : Integer-location based indexing for selecting scalars.</span> |
| <span class="sd"> DataFrame.iloc : Integer-location based indexing for selecting Series.</span> |
| <span class="sd"> Series.to_frame : Inverse of DataFrame.squeeze for a</span> |
| <span class="sd"> single-column DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> primes = ps.Series([2, 3, 5, 7])</span> |
| |
| <span class="sd"> Slicing might produce a Series with a single value:</span> |
| |
| <span class="sd"> >>> even_primes = primes[primes % 2 == 0]</span> |
| <span class="sd"> >>> even_primes</span> |
| <span class="sd"> 0 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> even_primes.squeeze()</span> |
| <span class="sd"> 2</span> |
| |
| <span class="sd"> Squeezing objects with more than one value in every axis does nothing:</span> |
| |
| <span class="sd"> >>> odd_primes = primes[primes % 2 == 1]</span> |
| <span class="sd"> >>> odd_primes</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 5</span> |
| <span class="sd"> 3 7</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> odd_primes.squeeze()</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 5</span> |
| <span class="sd"> 3 7</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Squeezing is even more effective when used with DataFrames.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 1 3 4</span> |
| |
| <span class="sd"> Slicing a single column will produce a DataFrame with the columns</span> |
| <span class="sd"> having only one value:</span> |
| |
| <span class="sd"> >>> df_a = df[['a']]</span> |
| <span class="sd"> >>> df_a</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 3</span> |
| |
| <span class="sd"> So the columns can be squeezed down, resulting in a Series:</span> |
| |
| <span class="sd"> >>> df_a.squeeze('columns')</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> Name: a, dtype: int64</span> |
| |
| <span class="sd"> Slicing a single row from a single column will produce a single</span> |
| <span class="sd"> scalar DataFrame:</span> |
| |
| <span class="sd"> >>> df_1a = df.loc[[1], ['a']]</span> |
| <span class="sd"> >>> df_1a</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 3</span> |
| |
| <span class="sd"> Squeezing the rows produces a single scalar Series:</span> |
| |
| <span class="sd"> >>> df_1a.squeeze('rows')</span> |
| <span class="sd"> a 3</span> |
| <span class="sd"> Name: 1, dtype: int64</span> |
| |
| <span class="sd"> Squeezing all axes will project directly into a scalar:</span> |
| |
| <span class="sd"> >>> df_1a.squeeze()</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="s2">"index"</span> <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="s2">"rows"</span> <span class="k">else</span> <span class="n">axis</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="n">is_squeezable</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">[:</span><span class="mi">2</span><span class="p">])</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="c1"># If DataFrame has multiple columns, there is no change.</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_squeezable</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> |
| <span class="n">series_from_column</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="n">has_single_value</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">series_from_column</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="c1"># If DataFrame has only a single value, use pandas API directly.</span> |
| <span class="k">if</span> <span class="n">has_single_value</span><span class="p">:</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">squeeze</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="k">else</span> <span class="n">result</span> |
| <span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">series_from_column</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># The case of Series is simple.</span> |
| <span class="c1"># If Series has only a single value, just return it as a scalar.</span> |
| <span class="c1"># Otherwise, there is no change.</span> |
| <span class="n">self_top_two</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="s2">"Series"</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> |
| <span class="n">has_single_value</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">self_top_two</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="n">self_top_two</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">has_single_value</span> <span class="k">else</span> <span class="bp">self</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">truncate</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">before</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">after</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">copy</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrameOrSeries</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Truncate a Series or DataFrame before and after some index value.</span> |
| |
| <span class="sd"> This is a useful shorthand for boolean indexing based on index</span> |
| <span class="sd"> values above or below certain thresholds.</span> |
| |
| <span class="sd"> .. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`</span> |
| <span class="sd"> which can be expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> before : date, str, int</span> |
| <span class="sd"> Truncate all rows before this index value.</span> |
| <span class="sd"> after : date, str, int</span> |
| <span class="sd"> Truncate all rows after this index value.</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, optional</span> |
| <span class="sd"> Axis to truncate. Truncates the index (rows) by default.</span> |
| <span class="sd"> copy : bool, default True</span> |
| <span class="sd"> Return a copy of the truncated section.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> type of caller</span> |
| <span class="sd"> The truncated Series or DataFrame.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.loc : Select a subset of a DataFrame by label.</span> |
| <span class="sd"> DataFrame.iloc : Select a subset of a DataFrame by position.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],</span> |
| <span class="sd"> ... 'B': ['f', 'g', 'h', 'i', 'j'],</span> |
| <span class="sd"> ... 'C': ['k', 'l', 'm', 'n', 'o']},</span> |
| <span class="sd"> ... index=[1, 2, 3, 4, 5])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 1 a f k</span> |
| <span class="sd"> 2 b g l</span> |
| <span class="sd"> 3 c h m</span> |
| <span class="sd"> 4 d i n</span> |
| <span class="sd"> 5 e j o</span> |
| |
| <span class="sd"> >>> df.truncate(before=2, after=4)</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 2 b g l</span> |
| <span class="sd"> 3 c h m</span> |
| <span class="sd"> 4 d i n</span> |
| |
| <span class="sd"> The columns of a DataFrame can be truncated.</span> |
| |
| <span class="sd"> >>> df.truncate(before="A", after="B", axis="columns")</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 a f</span> |
| <span class="sd"> 2 b g</span> |
| <span class="sd"> 3 c h</span> |
| <span class="sd"> 4 d i</span> |
| <span class="sd"> 5 e j</span> |
| |
| <span class="sd"> For Series, only rows can be truncated.</span> |
| |
| <span class="sd"> >>> df['A'].truncate(before=2, after=4)</span> |
| <span class="sd"> 2 b</span> |
| <span class="sd"> 3 c</span> |
| <span class="sd"> 4 d</span> |
| <span class="sd"> Name: A, dtype: object</span> |
| |
| <span class="sd"> A Series whose index consists of sorted integers.</span> |
| |
| <span class="sd"> >>> s = ps.Series([10, 20, 30, 40, 50, 60, 70],</span> |
| <span class="sd"> ... index=[1, 2, 3, 4, 5, 6, 7])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 1 10</span> |
| <span class="sd"> 2 20</span> |
| <span class="sd"> 3 30</span> |
| <span class="sd"> 4 40</span> |
| <span class="sd"> 5 50</span> |
| <span class="sd"> 6 60</span> |
| <span class="sd"> 7 70</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.truncate(2, 5)</span> |
| <span class="sd"> 2 20</span> |
| <span class="sd"> 3 30</span> |
| <span class="sd"> 4 40</span> |
| <span class="sd"> 5 50</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> A Series whose index consists of sorted strings.</span> |
| |
| <span class="sd"> >>> s = ps.Series([10, 20, 30, 40, 50, 60, 70],</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> a 10</span> |
| <span class="sd"> b 20</span> |
| <span class="sd"> c 30</span> |
| <span class="sd"> d 40</span> |
| <span class="sd"> e 50</span> |
| <span class="sd"> f 60</span> |
| <span class="sd"> g 70</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.truncate('b', 'e')</span> |
| <span class="sd"> b 20</span> |
| <span class="sd"> c 30</span> |
| <span class="sd"> d 40</span> |
| <span class="sd"> e 50</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="n">indexes</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span> |
| <span class="n">indexes_increasing</span> <span class="o">=</span> <span class="n">indexes</span><span class="o">.</span><span class="n">is_monotonic_increasing</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">indexes_increasing</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">indexes</span><span class="o">.</span><span class="n">is_monotonic_decreasing</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"truncate requires a sorted index"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="p">(</span><span class="n">before</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span><span class="n">after</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> <span class="k">if</span> <span class="n">copy</span> <span class="k">else</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="k">if</span> <span class="p">(</span><span class="n">before</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">after</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">and</span> <span class="n">before</span> <span class="o">></span> <span class="n">after</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Truncate: </span><span class="si">%s</span><span class="s2"> must be after </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">after</span><span class="p">,</span> <span class="n">before</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">indexes_increasing</span><span class="p">:</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[arg-type, assignment]</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">after</span><span class="p">:</span><span class="n">before</span><span class="p">]</span> <span class="c1"># type: ignore[arg-type,assignment]</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">indexes_increasing</span><span class="p">:</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">after</span><span class="p">:</span><span class="n">before</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrameOrSeries</span><span class="p">,</span> <span class="n">result</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> <span class="k">if</span> <span class="n">copy</span> <span class="k">else</span> <span class="n">result</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">to_markdown</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Print Series or DataFrame in Markdown-friendly format.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas object is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> buf : writable buffer, defaults to sys.stdout</span> |
| <span class="sd"> Where to send the output. By default, the output is printed to</span> |
| <span class="sd"> sys.stdout. Pass a writable buffer if you need to further process</span> |
| <span class="sd"> the output.</span> |
| <span class="sd"> mode : str, optional</span> |
| <span class="sd"> Mode in which file is opened.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> These parameters will be passed to `tabulate`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> str</span> |
| <span class="sd"> Series or DataFrame in Markdown-friendly format.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal")</span> |
| <span class="sd"> >>> print(psser.to_markdown()) # doctest: +SKIP</span> |
| <span class="sd"> | | animal |</span> |
| <span class="sd"> |---:|:---------|</span> |
| <span class="sd"> | 0 | elk |</span> |
| <span class="sd"> | 1 | pig |</span> |
| <span class="sd"> | 2 | dog |</span> |
| <span class="sd"> | 3 | quetzal |</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame(</span> |
| <span class="sd"> ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> print(psdf.to_markdown()) # doctest: +SKIP</span> |
| <span class="sd"> | | animal_1 | animal_2 |</span> |
| <span class="sd"> |---:|:-----------|:-----------|</span> |
| <span class="sd"> | 0 | elk | dog |</span> |
| <span class="sd"> | 1 | pig | quetzal |</span> |
| <span class="sd"> """</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"`to_markdown` loads all data into the driver's memory. "</span> |
| <span class="s2">"It should only be used if the resulting pandas object is expected to be small."</span> |
| <span class="p">)</span> |
| <span class="c1"># Make sure locals() call is at the top of the function so we don't capture local variables.</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="n">internal_pandas</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">internal_pandas</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">,</span> <span class="nb">type</span><span class="p">(</span><span class="n">internal_pandas</span><span class="p">)</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="c1"># TODO: add 'downcast' when value parameter exists</span> |
| <span class="k">def</span> <span class="nf">bfill</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=`bfill```.</span> |
| |
| <span class="sd"> .. note:: the current implementation of 'bfill' uses Spark's Window</span> |
| <span class="sd"> without specifying partition specification. This leads to moving all data into</span> |
| <span class="sd"> a single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method with very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or `index`}</span> |
| <span class="sd"> 1 and `columns` are not supported.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Fill in place (do not create a new object)</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span> |
| <span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span> |
| <span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span> |
| <span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| <span class="sd"> DataFrame or Series with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [None, 3, None, None],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 NaN 2.0 NaN 0</span> |
| <span class="sd"> 1 3.0 4.0 NaN 1</span> |
| <span class="sd"> 2 NaN NaN NaN 5</span> |
| <span class="sd"> 3 NaN 3.0 1.0 4</span> |
| |
| <span class="sd"> Propagate non-null values backward.</span> |
| |
| <span class="sd"> >>> psdf.bfill()</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 3.0 2.0 1.0 0</span> |
| <span class="sd"> 1 3.0 4.0 1.0 1</span> |
| <span class="sd"> 2 NaN 3.0 1.0 5</span> |
| <span class="sd"> 3 NaN 3.0 1.0 4</span> |
| |
| <span class="sd"> For Series</span> |
| |
| <span class="sd"> >>> psser = ps.Series([None, None, None, 1])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 NaN</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> psser.bfill()</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 1.0</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">"bfill"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="n">inplace</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span> |
| |
| <span class="n">backfill</span> <span class="o">=</span> <span class="n">bfill</span> |
| |
| <span class="c1"># TODO: add 'downcast' when value parameter exists</span> |
| <span class="k">def</span> <span class="nf">ffill</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=`ffill```.</span> |
| |
| <span class="sd"> .. note:: the current implementation of 'ffill' uses Spark's Window</span> |
| <span class="sd"> without specifying partition specification. This leads to moving all data into</span> |
| <span class="sd"> a single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method with very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or `index`}</span> |
| <span class="sd"> 1 and `columns` are not supported.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Fill in place (do not create a new object)</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span> |
| <span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span> |
| <span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span> |
| <span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| <span class="sd"> DataFrame or Series with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [None, 3, None, None],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 NaN 2.0 NaN 0</span> |
| <span class="sd"> 1 3.0 4.0 NaN 1</span> |
| <span class="sd"> 2 NaN NaN NaN 5</span> |
| <span class="sd"> 3 NaN 3.0 1.0 4</span> |
| |
| <span class="sd"> Propagate non-null values forward.</span> |
| |
| <span class="sd"> >>> psdf.ffill()</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 NaN 2.0 NaN 0</span> |
| <span class="sd"> 1 3.0 4.0 NaN 1</span> |
| <span class="sd"> 2 3.0 4.0 NaN 5</span> |
| <span class="sd"> 3 3.0 3.0 1.0 4</span> |
| |
| <span class="sd"> For Series</span> |
| |
| <span class="sd"> >>> psser = ps.Series([2, 4, None, 3])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 0 2.0</span> |
| <span class="sd"> 1 4.0</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 3 3.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> psser.ffill()</span> |
| <span class="sd"> 0 2.0</span> |
| <span class="sd"> 1 4.0</span> |
| <span class="sd"> 2 4.0</span> |
| <span class="sd"> 3 3.0</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">"ffill"</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="n">inplace</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span> |
| |
| <span class="n">pad</span> <span class="o">=</span> <span class="n">ffill</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">at</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">AtIndexer</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">AtIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="n">at</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">AtIndexer</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">iat</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">iAtIndexer</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">iAtIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="n">iat</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">iAtIndexer</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">iloc</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">iLocIndexer</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">iLocIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="n">iloc</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">iLocIndexer</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">loc</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">LocIndexer</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">LocIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="n">loc</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">LocIndexer</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="k">def</span> <span class="fm">__bool__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">NoReturn</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"The truth value of a </span><span class="si">{0}</span><span class="s2"> is ambiguous. "</span> |
| <span class="s2">"Use a.empty, a.bool(), a.item(), a.any() or a.all()."</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_count_expr</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">shutil</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">tempfile</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">import</span> <span class="nn">pyspark.pandas.generic</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_HOME"</span><span class="p">])</span> |
| |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">generic</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"ps"</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"pyspark.pandas.generic tests"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="p">)</span> |
| |
| <span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"path"</span><span class="p">]</span> <span class="o">=</span> <span class="n">path</span> |
| |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">generic</span><span class="p">,</span> |
| <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">shutil</span><span class="o">.</span><span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">ignore_errors</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <!-- Page footer: copyright notice and generator credit. --> |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| <p> |
| <!-- NOTE(review): the copyright holder is empty; presumably the Sphinx `copyright` setting is unset - confirm. --> |
| © Copyright .<br/> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |