| |
| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8" /> |
| <title>pyspark.pandas.series — PySpark 3.3.0 documentation</title> |
| |
| <!-- Theme stylesheet. --> |
| <link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| |
| <!-- Font Awesome 5.13.0 stylesheet, plus preloads for two of its web fonts. --> |
| <link rel="stylesheet" |
| href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| <!-- Vendored text fonts. --> |
| <link rel="stylesheet" |
| href="../../../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" |
| href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| |
| <link rel="stylesheet" href="../../../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/language_data.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <!-- The MathJax configuration block is placed ahead of the async loader so it is already in the document when MathJax starts up. --> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <!-- Fixed top navigation bar: brand link, collapse toggle, and links to the top-level documentation sections. --> |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation"> |
| <div class="container-xl"> |
| |
| <a class="navbar-brand" href="../../../index.html"> |
| |
| <!-- The image is the only content of the link, so its alt text names the link destination. --> |
| <img src="../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home" /> |
| |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../getting_started/index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| |
| </ul> |
| |
| |
| |
| |
| <ul class="navbar-nav"> |
| |
| |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <!-- Left sidebar: documentation search form followed by the main navigation list. --> |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get"> |
| <!-- Decorative icon; the input below carries the accessible name. --> |
| <i class="icon fas fa-search" aria-hidden="true"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| |
| <ul class="nav bd-sidenav"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </ul> |
| |
| <!-- Close div.bd-toc-item explicitly so it is not left open when the nav ends. --> |
| </div> |
| </nav> |
| </div> |
| |
| |
| |
| <!-- In-page table of contents column; nav#bd-toc-nav is the target of the body's data-target attribute. --> |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| |
| <nav id="bd-toc-nav" aria-label="On this page"> |
| <ul class="nav section-nav flex-column"> |
| |
| </ul> |
| </nav> |
| |
| |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <h1>Source code for pyspark.pandas.series</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd">A wrapper class for Spark Column to behave similar to pandas Series.</span> |
| <span class="sd">"""</span> |
| <span class="kn">import</span> <span class="nn">datetime</span> |
| <span class="kn">import</span> <span class="nn">re</span> |
| <span class="kn">import</span> <span class="nn">inspect</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">from</span> <span class="nn">collections.abc</span> <span class="kn">import</span> <span class="n">Mapping</span> |
| <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">partial</span><span class="p">,</span> <span class="n">reduce</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">Callable</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">Generic</span><span class="p">,</span> |
| <span class="n">IO</span><span class="p">,</span> |
| <span class="n">Iterable</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">Sequence</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">Type</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">cast</span><span class="p">,</span> |
| <span class="n">no_type_check</span><span class="p">,</span> |
| <span class="n">overload</span><span class="p">,</span> |
| <span class="n">TYPE_CHECKING</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span> |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pandas.core.accessor</span> <span class="kn">import</span> <span class="n">CachedAccessor</span> |
| <span class="kn">from</span> <span class="nn">pandas.io.formats.printing</span> <span class="kn">import</span> <span class="n">pprint_thing</span> |
| <span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="p">(</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="n">is_list_like</span><span class="p">,</span> |
| <span class="n">is_hashable</span><span class="p">,</span> |
| <span class="n">CategoricalDtype</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pandas.tseries.frequencies</span> <span class="kn">import</span> <span class="n">DateOffset</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">SparkDataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">ArrayType</span><span class="p">,</span> |
| <span class="n">BooleanType</span><span class="p">,</span> |
| <span class="n">DecimalType</span><span class="p">,</span> |
| <span class="n">DoubleType</span><span class="p">,</span> |
| <span class="n">FloatType</span><span class="p">,</span> |
| <span class="n">IntegerType</span><span class="p">,</span> |
| <span class="n">IntegralType</span><span class="p">,</span> |
| <span class="n">LongType</span><span class="p">,</span> |
| <span class="n">NumericType</span><span class="p">,</span> |
| <span class="n">Row</span><span class="p">,</span> |
| <span class="n">StructType</span><span class="p">,</span> |
| <span class="n">TimestampType</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.window</span> <span class="kn">import</span> <span class="n">Window</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">Name</span><span class="p">,</span> <span class="n">Scalar</span><span class="p">,</span> <span class="n">T</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.accessors</span> <span class="kn">import</span> <span class="n">PandasOnSparkSeriesMethods</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.categorical</span> <span class="kn">import</span> <span class="n">CategoricalAccessor</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.config</span> <span class="kn">import</span> <span class="n">get_option</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.base</span> <span class="kn">import</span> <span class="n">IndexOpsMixin</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.exceptions</span> <span class="kn">import</span> <span class="n">SparkPandasIndexingError</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.generic</span> <span class="kn">import</span> <span class="n">Frame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">InternalField</span><span class="p">,</span> |
| <span class="n">InternalFrame</span><span class="p">,</span> |
| <span class="n">DEFAULT_SERIES_NAME</span><span class="p">,</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">,</span> |
| <span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.missing.series</span> <span class="kn">import</span> <span class="n">MissingPandasLikeSeries</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.plot</span> <span class="kn">import</span> <span class="n">PandasOnSparkPlotAccessor</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.ml</span> <span class="kn">import</span> <span class="n">corr</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">combine_frames</span><span class="p">,</span> |
| <span class="n">is_name_like_tuple</span><span class="p">,</span> |
| <span class="n">is_name_like_value</span><span class="p">,</span> |
| <span class="n">name_like_string</span><span class="p">,</span> |
| <span class="n">same_anchor</span><span class="p">,</span> |
| <span class="n">scol_for</span><span class="p">,</span> |
| <span class="n">sql_conf</span><span class="p">,</span> |
| <span class="n">validate_arguments_and_invoke_function</span><span class="p">,</span> |
| <span class="n">validate_axis</span><span class="p">,</span> |
| <span class="n">validate_bool_kwarg</span><span class="p">,</span> |
| <span class="n">verify_temp_column_name</span><span class="p">,</span> |
| <span class="n">SPARK_CONF_ARROW_ENABLED</span><span class="p">,</span> |
| <span class="n">log_advice</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.datetimes</span> <span class="kn">import</span> <span class="n">DatetimeMethods</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark.accessors</span> <span class="kn">import</span> <span class="n">SparkSeriesMethods</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.strings</span> <span class="kn">import</span> <span class="n">StringMethods</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">infer_return_type</span><span class="p">,</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">,</span> |
| <span class="n">ScalarType</span><span class="p">,</span> |
| <span class="n">SeriesType</span><span class="p">,</span> |
| <span class="n">create_type_for_series_type</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql._typing</span> <span class="kn">import</span> <span class="n">ColumnOrName</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">SeriesGroupBy</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">Index</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark.accessors</span> <span class="kn">import</span> <span class="n">SparkIndexOpsMethods</span> |
| |
| <span class="c1"># This regular expression pattern is compiled and defined here to avoid recompiling the same</span> |
| <span class="c1"># pattern every time it is used in _repr_ in Series.</span> |
| <span class="c1"># This pattern basically seeks the footer string from pandas' Series repr.</span> |
| <span class="n">REPR_PATTERN</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s2">"Length: (?P&lt;length&gt;[0-9]+)"</span><span class="p">)</span> |
| |
| <span class="n">_flex_doc_SERIES</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Return </span><span class="si">{desc}</span><span class="s2"> of series and other, element-wise (binary operator `</span><span class="si">{op_name}</span><span class="s2">`).</span> |
| |
| <span class="s2">Equivalent to ``</span><span class="si">{equiv}</span><span class="s2">``</span> |
| |
| <span class="s2">Parameters</span> |
| <span class="s2">----------</span> |
| <span class="s2">other : Series or scalar value</span> |
| |
| <span class="s2">Returns</span> |
| <span class="s2">-------</span> |
| <span class="s2">Series</span> |
| <span class="s2"> The result of the operation.</span> |
| |
| <span class="s2">See Also</span> |
| <span class="s2">--------</span> |
| <span class="s2">Series.</span><span class="si">{reverse}</span><span class="s2"></span> |
| |
| <span class="si">{series_examples}</span><span class="s2"></span> |
| <span class="s2">"""</span> |
| |
| <span class="n">_add_example_SERIES</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Examples</span> |
| <span class="s2">--------</span> |
| <span class="s2">>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],</span> |
| <span class="s2">... 'b': [2, np.nan, 2, np.nan]},</span> |
| <span class="s2">... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| <span class="s2">>>> df</span> |
| <span class="s2"> a b</span> |
| <span class="s2">a 2.0 2.0</span> |
| <span class="s2">b 2.0 NaN</span> |
| <span class="s2">c 4.0 2.0</span> |
| <span class="s2">d NaN NaN</span> |
| |
| <span class="s2">>>> df.a.add(df.b)</span> |
| <span class="s2">a 4.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 6.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| |
| <span class="s2">>>> df.a.radd(df.b)</span> |
| <span class="s2">a 4.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 6.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| <span class="s2">"""</span> |
| |
| <span class="n">_sub_example_SERIES</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Examples</span> |
| <span class="s2">--------</span> |
| <span class="s2">>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],</span> |
| <span class="s2">... 'b': [2, np.nan, 2, np.nan]},</span> |
| <span class="s2">... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| <span class="s2">>>> df</span> |
| <span class="s2"> a b</span> |
| <span class="s2">a 2.0 2.0</span> |
| <span class="s2">b 2.0 NaN</span> |
| <span class="s2">c 4.0 2.0</span> |
| <span class="s2">d NaN NaN</span> |
| |
| <span class="s2">>>> df.a.subtract(df.b)</span> |
| <span class="s2">a 0.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 2.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| |
| <span class="s2">>>> df.a.rsub(df.b)</span> |
| <span class="s2">a 0.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c -2.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| <span class="s2">"""</span> |
| |
| <span class="n">_mul_example_SERIES</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Examples</span> |
| <span class="s2">--------</span> |
| <span class="s2">>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],</span> |
| <span class="s2">... 'b': [2, np.nan, 2, np.nan]},</span> |
| <span class="s2">... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| <span class="s2">>>> df</span> |
| <span class="s2"> a b</span> |
| <span class="s2">a 2.0 2.0</span> |
| <span class="s2">b 2.0 NaN</span> |
| <span class="s2">c 4.0 2.0</span> |
| <span class="s2">d NaN NaN</span> |
| |
| <span class="s2">>>> df.a.multiply(df.b)</span> |
| <span class="s2">a 4.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 8.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| |
| <span class="s2">>>> df.a.rmul(df.b)</span> |
| <span class="s2">a 4.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 8.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| <span class="s2">"""</span> |
| |
| <span class="n">_div_example_SERIES</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Examples</span> |
| <span class="s2">--------</span> |
| <span class="s2">>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],</span> |
| <span class="s2">... 'b': [2, np.nan, 2, np.nan]},</span> |
| <span class="s2">... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| <span class="s2">>>> df</span> |
| <span class="s2"> a b</span> |
| <span class="s2">a 2.0 2.0</span> |
| <span class="s2">b 2.0 NaN</span> |
| <span class="s2">c 4.0 2.0</span> |
| <span class="s2">d NaN NaN</span> |
| |
| <span class="s2">>>> df.a.divide(df.b)</span> |
| <span class="s2">a 1.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 2.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| |
| <span class="s2">>>> df.a.rdiv(df.b)</span> |
| <span class="s2">a 1.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 0.5</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| <span class="s2">"""</span> |
| |
| <span class="n">_pow_example_SERIES</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Examples</span> |
| <span class="s2">--------</span> |
| <span class="s2">>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],</span> |
| <span class="s2">... 'b': [2, np.nan, 2, np.nan]},</span> |
| <span class="s2">... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| <span class="s2">>>> df</span> |
| <span class="s2"> a b</span> |
| <span class="s2">a 2.0 2.0</span> |
| <span class="s2">b 2.0 NaN</span> |
| <span class="s2">c 4.0 2.0</span> |
| <span class="s2">d NaN NaN</span> |
| |
| <span class="s2">>>> df.a.pow(df.b)</span> |
| <span class="s2">a 4.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 16.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| |
| <span class="s2">>>> df.a.rpow(df.b)</span> |
| <span class="s2">a 4.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 16.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| <span class="s2">"""</span> |
| |
| <span class="n">_mod_example_SERIES</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Examples</span> |
| <span class="s2">--------</span> |
| <span class="s2">>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],</span> |
| <span class="s2">... 'b': [2, np.nan, 2, np.nan]},</span> |
| <span class="s2">... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| <span class="s2">>>> df</span> |
| <span class="s2"> a b</span> |
| <span class="s2">a 2.0 2.0</span> |
| <span class="s2">b 2.0 NaN</span> |
| <span class="s2">c 4.0 2.0</span> |
| <span class="s2">d NaN NaN</span> |
| |
| <span class="s2">>>> df.a.mod(df.b)</span> |
| <span class="s2">a 0.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 0.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| |
| <span class="s2">>>> df.a.rmod(df.b)</span> |
| <span class="s2">a 0.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 2.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| <span class="s2">"""</span> |
| |
| <span class="n">_floordiv_example_SERIES</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Examples</span> |
| <span class="s2">--------</span> |
| <span class="s2">>>> df = ps.DataFrame({'a': [2, 2, 4, np.nan],</span> |
| <span class="s2">... 'b': [2, np.nan, 2, np.nan]},</span> |
| <span class="s2">... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| <span class="s2">>>> df</span> |
| <span class="s2"> a b</span> |
| <span class="s2">a 2.0 2.0</span> |
| <span class="s2">b 2.0 NaN</span> |
| <span class="s2">c 4.0 2.0</span> |
| <span class="s2">d NaN NaN</span> |
| |
| <span class="s2">>>> df.a.floordiv(df.b)</span> |
| <span class="s2">a 1.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 2.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| |
| <span class="s2">>>> df.a.rfloordiv(df.b)</span> |
| <span class="s2">a 1.0</span> |
| <span class="s2">b NaN</span> |
| <span class="s2">c 0.0</span> |
| <span class="s2">d NaN</span> |
| <span class="s2">dtype: float64</span> |
| <span class="s2">"""</span> |
| |
| <span class="c1"># Needed to disambiguate Series.str and str type</span> |
| <span class="n">str_type</span> <span class="o">=</span> <span class="nb">str</span> |
| |
| |
| <div class="viewcode-block" id="Series"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.html#pyspark.pandas.Series">[docs]</a><span class="k">class</span> <span class="nc">Series</span><span class="p">(</span><span class="n">Frame</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">,</span> <span class="n">Generic</span><span class="p">[</span><span class="n">T</span><span class="p">]):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> pandas-on-Spark Series that corresponds to pandas Series logically. This holds Spark Column</span> |
| <span class="sd"> internally.</span> |
| |
| <span class="sd"> :ivar _internal: an internal immutable Frame to manage metadata.</span> |
| <span class="sd"> :type _internal: InternalFrame</span> |
| <span class="sd"> :ivar _psdf: Parent's pandas-on-Spark DataFrame</span> |
| <span class="sd"> :type _psdf: ps.DataFrame</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> data : array-like, dict, or scalar value, pandas Series</span> |
| <span class="sd"> Contains data stored in Series</span> |
| <span class="sd"> Note that if `data` is a pandas Series, other arguments should not be used.</span> |
| <span class="sd"> index : array-like or Index (1d)</span> |
| <span class="sd"> Values must be hashable and have the same length as `data`.</span> |
| <span class="sd"> Non-unique index values are allowed. Will default to</span> |
| <span class="sd"> RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index</span> |
| <span class="sd"> sequence are used, the index will override the keys found in the</span> |
| <span class="sd"> dict.</span> |
| <span class="sd"> dtype : numpy.dtype or None</span> |
| <span class="sd"> If None, dtype will be inferred</span> |
| <span class="sd"> copy : boolean, default False</span> |
| <span class="sd"> Copy input data</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> <span class="c1"># type: ignore[no-untyped-def]</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">fastpath</span><span class="o">=</span><span class="kc">False</span> |
| <span class="p">):</span> |
| <span class="k">assert</span> <span class="n">data</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_anchor</span><span class="p">:</span> <span class="n">DataFrame</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_col_label</span><span class="p">:</span> <span class="n">Label</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">name</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="ow">not</span> <span class="n">copy</span> |
| <span class="k">assert</span> <span class="ow">not</span> <span class="n">fastpath</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_anchor</span> <span class="o">=</span> <span class="n">data</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_col_label</span> <span class="o">=</span> <span class="n">index</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">name</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="ow">not</span> <span class="n">copy</span> |
| <span class="k">assert</span> <span class="ow">not</span> <span class="n">fastpath</span> |
| <span class="n">s</span> <span class="o">=</span> <span class="n">data</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">s</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span> |
| <span class="n">data</span><span class="o">=</span><span class="n">data</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="n">copy</span><span class="p">,</span> <span class="n">fastpath</span><span class="o">=</span><span class="n">fastpath</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">s</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">s</span><span class="o">.</span><span class="n">name</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">])</span> |
| <span class="n">anchor</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_anchor</span> <span class="o">=</span> <span class="n">anchor</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_col_label</span> <span class="o">=</span> <span class="n">anchor</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="n">anchor</span><span class="p">,</span> <span class="s2">"_psseries"</span><span class="p">,</span> <span class="p">{</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">:</span> <span class="bp">self</span><span class="p">})</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_psdf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_anchor</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_internal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">InternalFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">select_column</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_column_label</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_col_label</span> |
| |
| <span class="k">def</span> <span class="nf">_update_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="o">==</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> <span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_anchor</span> <span class="o">=</span> <span class="n">psdf</span> |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"_psseries"</span><span class="p">,</span> <span class="p">{</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">:</span> <span class="bp">self</span><span class="p">})</span> |
| |
| <span class="k">def</span> <span class="nf">_with_new_scol</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">scol</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">field</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">InternalField</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Copy pandas-on-Spark Series with the new Spark Column.</span> |
| |
| <span class="sd"> :param scol: the new Spark Column</span> |
| <span class="sd"> :return: the copied Series</span> |
| <span class="sd"> """</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span> <span class="k">if</span> <span class="n">field</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| |
| <span class="n">spark</span><span class="p">:</span> <span class="s2">"SparkIndexOpsMethods"</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="s2">"spark"</span><span class="p">,</span> <span class="n">SparkSeriesMethods</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">dtypes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dtype</span><span class="p">:</span> |
| <span class="sd">"""Return the dtype object of the underlying data.</span> |
| |
| <span class="sd"> >>> s = ps.Series(list('abc'))</span> |
| <span class="sd"> >>> s.dtype == s.dtypes</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dtype</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">axes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="s2">"Index"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a list of the row axis labels.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> psser = ps.Series([1, 2, 3])</span> |
| <span class="sd"> >>> psser.axes</span> |
| <span class="sd"> [Int64Index([0, 1, 2], dtype='int64')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">]</span> |
| |
| <span class="c1"># Arithmetic Operators</span> |
| <div class="viewcode-block" id="Series.add"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.add.html#pyspark.pandas.Series.add">[docs]</a> <span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">+</span> <span class="n">other</span></div> |
| |
| <span class="n">add</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Addition"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"+"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"series + other"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"radd"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_add_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.radd"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.radd.html#pyspark.pandas.Series.radd">[docs]</a> <span class="k">def</span> <span class="nf">radd</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">+</span> <span class="bp">self</span></div> |
| |
| <span class="n">radd</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Reverse Addition"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"+"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"other + series"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"add"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_add_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.div"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.div.html#pyspark.pandas.Series.div">[docs]</a> <span class="k">def</span> <span class="nf">div</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">/</span> <span class="n">other</span></div> |
| |
| <span class="n">div</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Floating division"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"/"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"series / other"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"rdiv"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_div_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">divide</span> <span class="o">=</span> <span class="n">div</span> |
| |
| <div class="viewcode-block" id="Series.rdiv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rdiv.html#pyspark.pandas.Series.rdiv">[docs]</a> <span class="k">def</span> <span class="nf">rdiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">/</span> <span class="bp">self</span></div> |
| |
| <span class="n">rdiv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Reverse Floating division"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"/"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"other / series"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"div"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_div_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.truediv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.truediv.html#pyspark.pandas.Series.truediv">[docs]</a> <span class="k">def</span> <span class="nf">truediv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">/</span> <span class="n">other</span></div> |
| |
| <span class="n">truediv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Floating division"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"/"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"series / other"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"rtruediv"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_div_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.rtruediv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rtruediv.html#pyspark.pandas.Series.rtruediv">[docs]</a> <span class="k">def</span> <span class="nf">rtruediv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">/</span> <span class="bp">self</span></div> |
| |
| <span class="n">rtruediv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Reverse Floating division"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"/"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"other / series"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"truediv"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_div_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.mul"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.mul.html#pyspark.pandas.Series.mul">[docs]</a> <span class="k">def</span> <span class="nf">mul</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">*</span> <span class="n">other</span></div> |
| |
| <span class="n">mul</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Multiplication"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"*"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"series * other"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"rmul"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_mul_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">multiply</span> <span class="o">=</span> <span class="n">mul</span> |
| |
| <div class="viewcode-block" id="Series.rmul"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rmul.html#pyspark.pandas.Series.rmul">[docs]</a> <span class="k">def</span> <span class="nf">rmul</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">*</span> <span class="bp">self</span></div> |
| |
| <span class="n">rmul</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Reverse Multiplication"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"*"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"other * series"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"mul"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_mul_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.sub"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.sub.html#pyspark.pandas.Series.sub">[docs]</a> <span class="k">def</span> <span class="nf">sub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">-</span> <span class="n">other</span></div> |
| |
| <span class="n">sub</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Subtraction"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"-"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"series - other"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"rsub"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_sub_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">subtract</span> <span class="o">=</span> <span class="n">sub</span> |
| |
| <div class="viewcode-block" id="Series.rsub"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rsub.html#pyspark.pandas.Series.rsub">[docs]</a> <span class="k">def</span> <span class="nf">rsub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">-</span> <span class="bp">self</span></div> |
| |
| <span class="n">rsub</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Reverse Subtraction"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"-"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"other - series"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"sub"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_sub_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.mod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.mod.html#pyspark.pandas.Series.mod">[docs]</a> <span class="k">def</span> <span class="nf">mod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">%</span> <span class="n">other</span></div> |
| |
| <span class="n">mod</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Modulo"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"%"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"series </span><span class="si">% o</span><span class="s2">ther"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"rmod"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_mod_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.rmod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rmod.html#pyspark.pandas.Series.rmod">[docs]</a> <span class="k">def</span> <span class="nf">rmod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">%</span> <span class="bp">self</span></div> |
| |
| <span class="n">rmod</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Reverse Modulo"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"%"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"other </span><span class="si">% s</span><span class="s2">eries"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"mod"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_mod_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.pow"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.pow.html#pyspark.pandas.Series.pow">[docs]</a> <span class="k">def</span> <span class="nf">pow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">**</span> <span class="n">other</span></div> |
| |
| <span class="nb">pow</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Exponential power of series"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"**"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"series ** other"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"rpow"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_pow_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.rpow"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rpow.html#pyspark.pandas.Series.rpow">[docs]</a> <span class="k">def</span> <span class="nf">rpow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">**</span> <span class="bp">self</span></div> |
| |
| <span class="n">rpow</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Reverse Exponential power"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"**"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"other ** series"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"pow"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_pow_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.floordiv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.floordiv.html#pyspark.pandas.Series.floordiv">[docs]</a> <span class="k">def</span> <span class="nf">floordiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">//</span> <span class="n">other</span></div> |
| |
| <span class="n">floordiv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Integer division"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"//"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"series // other"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"rfloordiv"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_floordiv_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.rfloordiv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rfloordiv.html#pyspark.pandas.Series.rfloordiv">[docs]</a> <span class="k">def</span> <span class="nf">rfloordiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">//</span> <span class="bp">self</span></div> |
| |
| <span class="n">rfloordiv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_SERIES</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Reverse Integer division"</span><span class="p">,</span> |
| <span class="n">op_name</span><span class="o">=</span><span class="s2">"//"</span><span class="p">,</span> |
| <span class="n">equiv</span><span class="o">=</span><span class="s2">"other // series"</span><span class="p">,</span> |
| <span class="n">reverse</span><span class="o">=</span><span class="s2">"floordiv"</span><span class="p">,</span> |
| <span class="n">series_examples</span><span class="o">=</span><span class="n">_floordiv_example_SERIES</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># create accessor for pandas-on-Spark specific methods.</span> |
| <span class="n">pandas_on_spark</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"pandas_on_spark"</span><span class="p">,</span> <span class="n">PandasOnSparkSeriesMethods</span><span class="p">)</span> |
| |
| <span class="c1"># keep the name "koalas" for backward compatibility.</span> |
| <span class="n">koalas</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"koalas"</span><span class="p">,</span> <span class="n">PandasOnSparkSeriesMethods</span><span class="p">)</span> |
| |
| <span class="c1"># Comparison Operators</span> |
| <div class="viewcode-block" id="Series.eq"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.eq.html#pyspark.pandas.Series.eq">[docs]</a> <span class="k">def</span> <span class="nf">eq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is equal to the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.a == 1</span> |
| <span class="sd"> a True</span> |
| <span class="sd"> b False</span> |
| <span class="sd"> c False</span> |
| <span class="sd"> d False</span> |
| <span class="sd"> Name: a, dtype: bool</span> |
| |
| <span class="sd"> >>> df.b.eq(1)</span> |
| <span class="sd"> a True</span> |
| <span class="sd"> b False</span> |
| <span class="sd"> c True</span> |
| <span class="sd"> d False</span> |
| <span class="sd"> Name: b, dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">==</span> <span class="n">other</span></div> |
| |
| <span class="n">equals</span> <span class="o">=</span> <span class="n">eq</span> |
| |
| <div class="viewcode-block" id="Series.gt"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.gt.html#pyspark.pandas.Series.gt">[docs]</a> <span class="k">def</span> <span class="nf">gt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is greater than the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.a > 1</span> |
| <span class="sd"> a False</span> |
| <span class="sd"> b True</span> |
| <span class="sd"> c True</span> |
| <span class="sd"> d True</span> |
| <span class="sd"> Name: a, dtype: bool</span> |
| |
| <span class="sd"> >>> df.b.gt(1)</span> |
| <span class="sd"> a False</span> |
| <span class="sd"> b False</span> |
| <span class="sd"> c False</span> |
| <span class="sd"> d False</span> |
| <span class="sd"> Name: b, dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">></span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="Series.ge"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.ge.html#pyspark.pandas.Series.ge">[docs]</a> <span class="k">def</span> <span class="nf">ge</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is greater than or equal to the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.a >= 2</span> |
| <span class="sd"> a False</span> |
| <span class="sd"> b True</span> |
| <span class="sd"> c True</span> |
| <span class="sd"> d True</span> |
| <span class="sd"> Name: a, dtype: bool</span> |
| |
| <span class="sd"> >>> df.b.ge(2)</span> |
| <span class="sd"> a False</span> |
| <span class="sd"> b False</span> |
| <span class="sd"> c False</span> |
| <span class="sd"> d False</span> |
| <span class="sd"> Name: b, dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">>=</span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="Series.lt"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.lt.html#pyspark.pandas.Series.lt">[docs]</a> <span class="k">def</span> <span class="nf">lt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is less than the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.a &lt; 1</span> |
| <span class="sd"> a False</span> |
| <span class="sd"> b False</span> |
| <span class="sd"> c False</span> |
| <span class="sd"> d False</span> |
| <span class="sd"> Name: a, dtype: bool</span> |
| |
| <span class="sd"> >>> df.b.lt(2)</span> |
| <span class="sd"> a True</span> |
| <span class="sd"> b False</span> |
| <span class="sd"> c True</span> |
| <span class="sd"> d False</span> |
| <span class="sd"> Name: b, dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">&lt;</span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="Series.le"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.le.html#pyspark.pandas.Series.le">[docs]</a> <span class="k">def</span> <span class="nf">le</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is less than or equal to the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.a &lt;= 2</span> |
| <span class="sd"> a True</span> |
| <span class="sd"> b True</span> |
| <span class="sd"> c False</span> |
| <span class="sd"> d False</span> |
| <span class="sd"> Name: a, dtype: bool</span> |
| |
| <span class="sd"> >>> df.b.le(2)</span> |
| <span class="sd"> a True</span> |
| <span class="sd"> b False</span> |
| <span class="sd"> c True</span> |
| <span class="sd"> d False</span> |
| <span class="sd"> Name: b, dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">&lt;=</span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="Series.ne"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.ne.html#pyspark.pandas.Series.ne">[docs]</a> <span class="k">def</span> <span class="nf">ne</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is not equal to the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.a != 1</span> |
| <span class="sd"> a False</span> |
| <span class="sd"> b True</span> |
| <span class="sd"> c True</span> |
| <span class="sd"> d True</span> |
| <span class="sd"> Name: a, dtype: bool</span> |
| |
| <span class="sd"> >>> df.b.ne(1)</span> |
| <span class="sd"> a False</span> |
| <span class="sd"> b True</span> |
| <span class="sd"> c False</span> |
| <span class="sd"> d True</span> |
| <span class="sd"> Name: b, dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">!=</span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="Series.divmod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.divmod.html#pyspark.pandas.Series.divmod">[docs]</a> <span class="k">def</span> <span class="nf">divmod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return Integer division and modulo of series and other, element-wise</span> |
| <span class="sd"> (binary operator `divmod`).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series or scalar value</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> 2-Tuple of Series</span> |
| <span class="sd"> The result of the operation.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.rdivmod</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">floordiv</span><span class="p">(</span><span class="n">other</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">mod</span><span class="p">(</span><span class="n">other</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.rdivmod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rdivmod.html#pyspark.pandas.Series.rdivmod">[docs]</a> <span class="k">def</span> <span class="nf">rdivmod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return Integer division and modulo of series and other, element-wise</span> |
| <span class="sd"> (binary operator `rdivmod`).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series or scalar value</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> 2-Tuple of Series</span> |
| <span class="sd"> The result of the operation.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.divmod</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">rfloordiv</span><span class="p">(</span><span class="n">other</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">rmod</span><span class="p">(</span><span class="n">other</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.between"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.between.html#pyspark.pandas.Series.between">[docs]</a> <span class="k">def</span> <span class="nf">between</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">left</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">inclusive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return boolean Series equivalent to left &lt;= series &lt;= right.</span> |
| <span class="sd"> This function returns a boolean vector containing `True` wherever the</span> |
| <span class="sd"> corresponding Series element is between the boundary values `left` and</span> |
| <span class="sd"> `right`. NA values are treated as `False`.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> left : scalar or list-like</span> |
| <span class="sd"> Left boundary.</span> |
| <span class="sd"> right : scalar or list-like</span> |
| <span class="sd"> Right boundary.</span> |
| <span class="sd"> inclusive : bool, default True</span> |
| <span class="sd"> Include boundaries.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series representing whether each element is between left and</span> |
| <span class="sd"> right (inclusive).</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.gt : Greater than of series and other.</span> |
| <span class="sd"> Series.lt : Less than of series and other.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function is equivalent to ``(left &lt;= ser) &amp; (ser &lt;= right)``</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([2, 0, 4, 8, np.nan])</span> |
| |
| <span class="sd"> Boundary values are included by default:</span> |
| |
| <span class="sd"> >>> s.between(1, 4)</span> |
| <span class="sd"> 0 True</span> |
| <span class="sd"> 1 False</span> |
| <span class="sd"> 2 True</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> 4 False</span> |
| <span class="sd"> dtype: bool</span> |
| |
| <span class="sd"> With `inclusive` set to ``False`` boundary values are excluded:</span> |
| |
| <span class="sd"> >>> s.between(1, 4, inclusive=False)</span> |
| <span class="sd"> 0 True</span> |
| <span class="sd"> 1 False</span> |
| <span class="sd"> 2 False</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> 4 False</span> |
| <span class="sd"> dtype: bool</span> |
| |
| <span class="sd"> `left` and `right` can be any scalar value:</span> |
| |
| <span class="sd"> >>> s = ps.Series(['Alice', 'Bob', 'Carol', 'Eve'])</span> |
| <span class="sd"> >>> s.between('Anna', 'Daniel')</span> |
| <span class="sd"> 0 False</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 True</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">inclusive</span><span class="p">:</span> |
| <span class="n">lmask</span> <span class="o">=</span> <span class="bp">self</span> <span class="o">>=</span> <span class="n">left</span> |
| <span class="n">rmask</span> <span class="o">=</span> <span class="bp">self</span> <span class="o">&lt;=</span> <span class="n">right</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">lmask</span> <span class="o">=</span> <span class="bp">self</span> <span class="o">></span> <span class="n">left</span> |
| <span class="n">rmask</span> <span class="o">=</span> <span class="bp">self</span> <span class="o">&lt;</span> <span class="n">right</span> |
| |
| <span class="k">return</span> <span class="n">lmask</span> <span class="o">&</span> <span class="n">rmask</span></div> |
| |
| <div class="viewcode-block" id="Series.cov"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.cov.html#pyspark.pandas.Series.cov">[docs]</a> <span class="k">def</span> <span class="nf">cov</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute covariance with Series, excluding missing values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series</span> |
| <span class="sd"> Series with which to compute the covariance.</span> |
| <span class="sd"> min_periods : int, optional</span> |
| <span class="sd"> Minimum number of observations needed to have a valid result.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> float</span> |
| <span class="sd"> Covariance between Series and other</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> s1 = ps.Series([0.90010907, 0.13484424, 0.62036035])</span> |
| <span class="sd"> >>> s2 = ps.Series([0.12528585, 0.26962463, 0.51111198])</span> |
| <span class="sd"> >>> s1.cov(s2)</span> |
| <span class="sd"> -0.016857626527158744</span> |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"unsupported type: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">))</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">np</span><span class="o">.</span><span class="n">issubdtype</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">number</span><span class="p">):</span> <span class="c1"># type: ignore[arg-type]</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"unsupported dtype: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">np</span><span class="o">.</span><span class="n">issubdtype</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">number</span><span class="p">):</span> <span class="c1"># type: ignore[arg-type]</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"unsupported dtype: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">other</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> |
| |
| <span class="n">min_periods</span> <span class="o">=</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">min_periods</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">min_periods</span> |
| |
| <span class="k">if</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">other</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="n">other</span><span class="o">.</span><span class="n">to_frame</span><span class="p">())</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">min_periods</span><span class="p">))</span> <span class="o">&lt;</span> <span class="n">min_periods</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">covar_samp</span><span class="p">(</span><span class="o">*</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">1</span><span class="p">)[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span></div> |
| |
| <span class="c1"># TODO: NaN and None when ``arg`` is an empty dict</span> |
| <span class="c1"># TODO: Support ps.Series ``arg``</span> |
| <div class="viewcode-block" id="Series.map"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.map.html#pyspark.pandas.Series.map">[docs]</a> <span class="k">def</span> <span class="nf">map</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">arg</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Dict</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="n">na_action</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Map values of Series according to input correspondence.</span> |
| |
| <span class="sd"> Used for substituting each value in a Series with another value,</span> |
| <span class="sd"> that may be derived from a function, a ``dict`` or a pandas Series.</span> |
| |
| <span class="sd"> .. note:: make sure the size of the dictionary is not huge because it could</span> |
| <span class="sd"> degrade the performance or throw OutOfMemoryError due to a huge</span> |
| <span class="sd"> expression within Spark. Consider passing a function as an</span> |
| <span class="sd"> alternative in this case.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> arg : function, dict or pd.Series</span> |
| <span class="sd"> Mapping correspondence.</span> |
| <span class="sd"> na_action : {None, 'ignore'}, default None</span> |
| <span class="sd"> If `ignore`, propagate NA values, without passing them to the mapping correspondence.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Same index as caller.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.apply : For applying more complex functions on a Series.</span> |
| <span class="sd"> DataFrame.applymap : Apply a function elementwise on a whole DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> When ``arg`` is a dictionary, values in Series that are not in the</span> |
| <span class="sd"> dictionary (as keys) are converted to ``None``. However, if the</span> |
| <span class="sd"> dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.</span> |
| <span class="sd"> provides a method for default values), then this default is used</span> |
| <span class="sd"> rather than ``None``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(['cat', 'dog', None, 'rabbit'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 cat</span> |
| <span class="sd"> 1 dog</span> |
| <span class="sd"> 2 None</span> |
| <span class="sd"> 3 rabbit</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> ``map`` accepts a ``dict``. Values that are not found</span> |
| <span class="sd"> in the ``dict`` are converted to ``None``, unless the dict has a default</span> |
| <span class="sd"> value (e.g. ``defaultdict``):</span> |
| |
| <span class="sd"> >>> s.map({'cat': 'kitten', 'dog': 'puppy'})</span> |
| <span class="sd"> 0 kitten</span> |
| <span class="sd"> 1 puppy</span> |
| <span class="sd"> 2 None</span> |
| <span class="sd"> 3 None</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> It also accepts a pandas Series:</span> |
| |
| <span class="sd"> >>> pser = pd.Series(['kitten', 'puppy'], index=['cat', 'dog'])</span> |
| <span class="sd"> >>> s.map(pser)</span> |
| <span class="sd"> 0 kitten</span> |
| <span class="sd"> 1 puppy</span> |
| <span class="sd"> 2 None</span> |
| <span class="sd"> 3 None</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> It also accepts a function:</span> |
| |
| <span class="sd"> >>> def format(x) -> str:</span> |
| <span class="sd"> ... return 'I am a {}'.format(x)</span> |
| |
| <span class="sd"> >>> s.map(format)</span> |
| <span class="sd"> 0 I am a cat</span> |
| <span class="sd"> 1 I am a dog</span> |
| <span class="sd"> 2 I am a None</span> |
| <span class="sd"> 3 I am a rabbit</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> To avoid applying the function to missing values (and keep them as None)</span> |
| <span class="sd"> na_action='ignore' can be used:</span> |
| |
| <span class="sd"> >>> s.map('I am a {}'.format, na_action='ignore')</span> |
| <span class="sd"> 0 I am a cat</span> |
| <span class="sd"> 1 I am a dog</span> |
| <span class="sd"> 2 None</span> |
| <span class="sd"> 3 I am a rabbit</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="p">(</span><span class="nb">dict</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)):</span> |
| <span class="n">is_start</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="c1"># In case dictionary is empty.</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">))</span> |
| |
| <span class="k">for</span> <span class="n">to_replace</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">arg</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="k">if</span> <span class="n">is_start</span><span class="p">:</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">to_replace</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span> |
| <span class="n">is_start</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">to_replace</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="s2">"__missing__"</span><span class="p">):</span> |
| <span class="n">tmp_val</span> <span class="o">=</span> <span class="n">arg</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">_NoValue</span><span class="p">]</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="c1"># Remove in case it's set in defaultdict.</span> |
| <span class="k">del</span> <span class="n">arg</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">_NoValue</span><span class="p">]</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">tmp_val</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">))</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">current</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">transform_batch</span><span class="p">(</span><span class="k">lambda</span> <span class="n">pser</span><span class="p">:</span> <span class="n">pser</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">na_action</span><span class="p">))</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">shape</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span> |
| <span class="sd">"""Return a tuple of the shape of the underlying data."""</span> |
| <span class="k">return</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">),)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">name</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Name</span><span class="p">:</span> |
| <span class="sd">"""Return name of the Series."""</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span> |
| <span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">name</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">name</span> |
| |
| <span class="nd">@name</span><span class="o">.</span><span class="n">setter</span> |
| <span class="k">def</span> <span class="nf">name</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="n">Name</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: Changing index labels with a dictionary or Series is not supported yet.</span> |
| <div class="viewcode-block" id="Series.rename"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rename.html#pyspark.pandas.Series.rename">[docs]</a> <span class="k">def</span> <span class="nf">rename</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Alter Series index labels or name.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> index : scalar or function, optional</span> |
| <span class="sd"> Functions are transformations to apply to the index.</span> |
| <span class="sd"> Scalar will alter the Series.name attribute.</span> |
| |
| <span class="sd"> inplace : bool, default False</span> |
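| <span class="sd"> If True, update this Series in place and return it instead of creating</span> |
| <span class="sd"> a new Series; a `copy` value, if passed, is ignored. Not supported when</span> |
| <span class="sd"> `index` is a function.</span> |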
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with index labels or name altered.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 2, 3])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.rename("my_name") # scalar, changes Series.name</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> Name: my_name, dtype: int64</span> |
| |
| <span class="sd"> >>> s.rename(lambda x: x ** 2) # function, changes labels</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 4 3</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">pass</span> |
| <span class="k">if</span> <span class="n">callable</span><span class="p">(</span><span class="n">index</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"inplace"</span><span class="p">,</span> <span class="kc">False</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"inplace True is not supported yet for a function 'index'"</span><span class="p">)</span> |
| <span class="n">frame</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">new_index_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">frame</span><span class="p">,</span> <span class="s2">"__index_name__"</span><span class="p">)</span> |
| <span class="n">frame</span><span class="p">[</span><span class="n">new_index_name</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">index</span><span class="p">)</span> |
| <span class="n">frame</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="n">new_index_name</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">frame</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">name</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">frame</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"'index' of </span><span class="si">%s</span><span class="s2"> type is not supported yet"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">index</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="n">is_hashable</span><span class="p">(</span><span class="n">index</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Series.name must be a hashable type"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="p">(</span><span class="n">index</span><span class="p">,)</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">index</span><span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="n">field</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">index</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">field</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"inplace"</span><span class="p">,</span> <span class="kc">False</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_col_label</span> <span class="o">=</span> <span class="n">index</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.rename_axis"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rename_axis.html#pyspark.pandas.Series.rename_axis">[docs]</a> <span class="k">def</span> <span class="nf">rename_axis</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">mapper</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Set the name of the axis for the index or columns.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> mapper, index : scalar, list-like, dict-like or function, optional</span> |
| <span class="sd"> A scalar, list-like, dict-like or function transformation to</span> |
| <span class="sd"> apply to the index values.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> Modifies the object directly, instead of creating a new Series.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series, or None if `inplace` is True.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.rename : Alter Series index labels or name.</span> |
| <span class="sd"> DataFrame.rename : Alter DataFrame index labels or name.</span> |
| <span class="sd"> Index.rename : Set new names on index.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(["dog", "cat", "monkey"], name="animal")</span> |
| <span class="sd"> >>> s # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> 0 dog</span> |
| <span class="sd"> 1 cat</span> |
| <span class="sd"> 2 monkey</span> |
| <span class="sd"> Name: animal, dtype: object</span> |
| <span class="sd"> >>> s.rename_axis("index").sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 dog</span> |
| <span class="sd"> 1 cat</span> |
| <span class="sd"> 2 monkey</span> |
| <span class="sd"> Name: animal, dtype: object</span> |
| |
| <span class="sd"> **MultiIndex**</span> |
| |
| <span class="sd"> >>> index = pd.MultiIndex.from_product([['mammal'],</span> |
| <span class="sd"> ... ['dog', 'cat', 'monkey']],</span> |
| <span class="sd"> ... names=['type', 'name'])</span> |
| <span class="sd"> >>> s = ps.Series([4, 4, 2], index=index, name='num_legs')</span> |
| <span class="sd"> >>> s # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> type name</span> |
| <span class="sd"> mammal dog 4</span> |
| <span class="sd"> cat 4</span> |
| <span class="sd"> monkey 2</span> |
| <span class="sd"> Name: num_legs, dtype: int64</span> |
| <span class="sd"> >>> s.rename_axis(index={'type': 'class'}).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> class name</span> |
| <span class="sd"> mammal cat 4</span> |
| <span class="sd"> dog 4</span> |
| <span class="sd"> monkey 2</span> |
| <span class="sd"> Name: num_legs, dtype: int64</span> |
| <span class="sd"> >>> s.rename_axis(index=str.upper).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> TYPE NAME</span> |
| <span class="sd"> mammal cat 4</span> |
| <span class="sd"> dog 4</span> |
| <span class="sd"> monkey 2</span> |
| <span class="sd"> Name: num_legs, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">(</span><span class="n">mapper</span><span class="o">=</span><span class="n">mapper</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ps.Index"</span><span class="p">:</span> |
| <span class="sd">"""The index (axis labels) Column of the Series.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Index</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">index</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">is_unique</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return True if the values in the object are unique, False otherwise.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> is_unique : boolean</span> |
| |
| <span class="sd"> >>> ps.Series([1, 2, 3]).is_unique</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> ps.Series([1, 2, 2]).is_unique</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> ps.Series([1, 2, 3, None]).is_unique</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| |
| <span class="c1"># Here we check:</span> |
| <span class="c1"># 1. that the count and the distinct count of the non-null values are equal</span> |
| <span class="c1"># 2. that there is at most one null value, i.e. null itself is a distinct value.</span> |
| <span class="c1">#</span> |
| <span class="c1"># This workaround is in order to calculate the distinct count including nulls in</span> |
| <span class="c1"># single pass. Note that COUNT(DISTINCT expr) in Spark is designed to ignore nulls.</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> <span class="o">==</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">scol</span><span class="p">))</span> |
| <span class="o">&</span> <span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span> <span class="o"><=</span> <span class="mi">1</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span> |
| |
| <div class="viewcode-block" id="Series.reset_index"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.reset_index.html#pyspark.pandas.Series.reset_index">[docs]</a> <span class="k">def</span> <span class="nf">reset_index</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">drop</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">]]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Generate a new DataFrame or Series with the index reset.</span> |
| |
| <span class="sd"> This is useful when the index needs to be treated as a column,</span> |
| <span class="sd"> or when the index is meaningless and needs to be reset</span> |
| <span class="sd"> to the default before another operation.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> level : int, str, tuple, or list, optional</span> |
| <span class="sd"> For a Series with a MultiIndex, only remove the specified levels from the index.</span> |
| <span class="sd"> Removes all levels by default.</span> |
| <span class="sd"> drop : bool, default False</span> |
| <span class="sd"> Just reset the index, without inserting it as a column in the new DataFrame.</span> |
| <span class="sd"> name : object, optional</span> |
| <span class="sd"> The name to use for the column containing the original Series values.</span> |
| <span class="sd"> Uses self.name by default. This argument is ignored when drop is True.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> Modify the Series in place (do not create a new object).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> When `drop` is False (the default), a DataFrame is returned.</span> |
| <span class="sd"> The newly created columns will come first in the DataFrame,</span> |
| <span class="sd"> followed by the original Series values.</span> |
| <span class="sd"> When `drop` is True, a `Series` is returned.</span> |
| <span class="sd"> In either case, if ``inplace=True``, no value is returned.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, 4], index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))</span> |
| |
| <span class="sd"> Generate a DataFrame with default index.</span> |
| |
| <span class="sd"> >>> s.reset_index()</span> |
| <span class="sd"> idx 0</span> |
| <span class="sd"> 0 a 1</span> |
| <span class="sd"> 1 b 2</span> |
| <span class="sd"> 2 c 3</span> |
| <span class="sd"> 3 d 4</span> |
| |
| <span class="sd"> To specify the name of the new column use `name`.</span> |
| |
| <span class="sd"> >>> s.reset_index(name='values')</span> |
| <span class="sd"> idx values</span> |
| <span class="sd"> 0 a 1</span> |
| <span class="sd"> 1 b 2</span> |
| <span class="sd"> 2 c 3</span> |
| <span class="sd"> 3 d 4</span> |
| |
| <span class="sd"> To generate a new Series with the default index, set `drop` to True.</span> |
| |
| <span class="sd"> >>> s.reset_index(drop=True)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> To update the Series in place, without generating a new one</span> |
| <span class="sd"> set `inplace` to True. Note that it also requires ``drop=True``.</span> |
| |
| <span class="sd"> >>> s.reset_index(inplace=True, drop=True)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">drop</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Cannot reset_index inplace on a Series to create a DataFrame"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">drop</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[[</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">]]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">level</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="n">drop</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">drop</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="Series.to_frame"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.to_frame.html#pyspark.pandas.Series.to_frame">[docs]</a> <span class="k">def</span> <span class="nf">to_frame</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert Series to DataFrame.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : object, default None</span> |
| <span class="sd"> The passed name should substitute for the series name (if it has</span> |
| <span class="sd"> one).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame representation of Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(["a", "b", "c"])</span> |
| <span class="sd"> >>> s.to_frame()</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 c</span> |
| |
| <span class="sd"> >>> s = ps.Series(["a", "b", "c"], name="vals")</span> |
| <span class="sd"> >>> s.to_frame()</span> |
| <span class="sd"> vals</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 c</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">renamed</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">renamed</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">DEFAULT_SERIES_NAME</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">renamed</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">renamed</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span></div> |
| |
| <span class="n">to_dataframe</span> <span class="o">=</span> <span class="n">to_frame</span> |
| |
| <div class="viewcode-block" id="Series.to_string"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.to_string.html#pyspark.pandas.Series.to_string">[docs]</a> <span class="k">def</span> <span class="nf">to_string</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"NaN"</span><span class="p">,</span> |
| <span class="n">float_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="nb">float</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">length</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">dtype</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">name</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">max_rows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Render a string representation of the Series.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas object is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory. If the input</span> |
| <span class="sd"> is large, set max_rows parameter.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> buf : StringIO-like, optional</span> |
| <span class="sd"> buffer to write to</span> |
| <span class="sd"> na_rep : string, optional</span> |
| <span class="sd"> string representation of NaN to use, default 'NaN'</span> |
| <span class="sd"> float_format : one-parameter function, optional</span> |
| <span class="sd"> formatter function to apply to columns' elements if they are floats</span> |
| <span class="sd"> default None</span> |
| <span class="sd"> header : boolean, default True</span> |
| <span class="sd"> Add the Series header (index name)</span> |
| <span class="sd"> index : bool, optional</span> |
| <span class="sd"> Add index (row) labels, default True</span> |
| <span class="sd"> length : boolean, default False</span> |
| <span class="sd"> Add the Series length</span> |
| <span class="sd"> dtype : boolean, default False</span> |
| <span class="sd"> Add the Series dtype</span> |
| <span class="sd"> name : boolean, default False</span> |
| <span class="sd"> Add the Series name if not None</span> |
| <span class="sd"> max_rows : int, optional</span> |
| <span class="sd"> Maximum number of rows to show before truncating. If None, show</span> |
| <span class="sd"> all.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> formatted : string (if no buffer is passed)</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], columns=['dogs', 'cats'])</span> |
| <span class="sd"> >>> print(df['dogs'].to_string())</span> |
| <span class="sd"> 0 0.2</span> |
| <span class="sd"> 1 0.0</span> |
| <span class="sd"> 2 0.6</span> |
| <span class="sd"> 3 0.2</span> |
| |
| <span class="sd"> >>> print(df['dogs'].to_string(max_rows=2))</span> |
| <span class="sd"> 0 0.2</span> |
| <span class="sd"> 1 0.0</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Make sure locals() call is at the top of the function so we don't capture local variables.</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">max_rows</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psseries</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">max_rows</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psseries</span> <span class="o">=</span> <span class="bp">self</span> |
| |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psseries</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_string</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="o">.</span><span class="n">to_string</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.to_clipboard"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.to_clipboard.html#pyspark.pandas.Series.to_clipboard">[docs]</a> <span class="k">def</span> <span class="nf">to_clipboard</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">excel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">sep</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="c1"># Docstring defined below by reusing DataFrame.to_clipboard's.</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psseries</span> <span class="o">=</span> <span class="bp">self</span> |
| |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psseries</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_clipboard</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="o">.</span><span class="n">to_clipboard</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <span class="n">to_clipboard</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">to_clipboard</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="Series.to_dict"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.to_dict.html#pyspark.pandas.Series.to_dict">[docs]</a> <span class="k">def</span> <span class="nf">to_dict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">into</span><span class="p">:</span> <span class="n">Type</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-></span> <span class="n">Mapping</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert Series to {label -> value} dict or dict-like object.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas Series is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> into : class, default dict</span> |
| <span class="sd"> The collections.abc.Mapping subclass to use as the return</span> |
| <span class="sd"> object. Can be the actual class or an empty</span> |
| <span class="sd"> instance of the mapping type you want. If you want a</span> |
| <span class="sd"> collections.defaultdict, you must pass it initialized.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> collections.abc.Mapping</span> |
| <span class="sd"> Key-value representation of Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, 4])</span> |
| <span class="sd"> >>> s_dict = s.to_dict()</span> |
| <span class="sd"> >>> sorted(s_dict.items())</span> |
| <span class="sd"> [(0, 1), (1, 2), (2, 3), (3, 4)]</span> |
| |
| <span class="sd"> >>> from collections import OrderedDict, defaultdict</span> |
| <span class="sd"> >>> s.to_dict(OrderedDict)</span> |
| <span class="sd"> OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])</span> |
| |
| <span class="sd"> >>> dd = defaultdict(list)</span> |
| <span class="sd"> >>> s.to_dict(dd) # doctest: +ELLIPSIS</span> |
| <span class="sd"> defaultdict(&lt;class 'list'&gt;, {...})</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Make sure locals() call is at the top of the function so we don't capture local variables.</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psseries</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psseries</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_dict</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="o">.</span><span class="n">to_dict</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.to_latex"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.to_latex.html#pyspark.pandas.Series.to_latex">[docs]</a> <span class="k">def</span> <span class="nf">to_latex</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">col_space</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"NaN"</span><span class="p">,</span> |
| <span class="n">formatters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span> |
| <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">str</span><span class="p">]],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]]</span> |
| <span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">float_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="nb">float</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">sparsify</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">bold_rows</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">column_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">longtable</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">escape</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">decimal</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"."</span><span class="p">,</span> |
| <span class="n">multicolumn</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">multicolumn_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">multirow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psseries</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psseries</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_latex</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="o">.</span><span class="n">to_latex</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <span class="n">to_latex</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">to_latex</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="Series.to_pandas"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.to_pandas.html#pyspark.pandas.Series.to_pandas">[docs]</a> <span class="k">def</span> <span class="nf">to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a pandas Series.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas object is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], columns=['dogs', 'cats'])</span> |
| <span class="sd"> >>> df['dogs'].to_pandas()</span> |
| <span class="sd"> 0 0.2</span> |
| <span class="sd"> 1 0.0</span> |
| <span class="sd"> 2 0.6</span> |
| <span class="sd"> 3 0.2</span> |
| <span class="sd"> Name: dogs, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"`to_pandas` loads all data into the driver's memory. "</span> |
| <span class="s2">"It should only be used if the resulting pandas Series is expected to be small."</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span></div> |
| |
| <span class="k">def</span> <span class="nf">_to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Same as `to_pandas()`, but without issuing the advice log; for internal usage.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <div class="viewcode-block" id="Series.to_list"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.to_list.html#pyspark.pandas.Series.to_list">[docs]</a> <span class="k">def</span> <span class="nf">to_list</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a list of the values.</span> |
| |
| <span class="sd"> These are each a scalar type, which is a Python scalar</span> |
| <span class="sd"> (for str, int, float) or a pandas scalar</span> |
| <span class="sd"> (for Timestamp/Timedelta/Interval/Period)</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting list is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> """</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"`to_list` loads all data into the driver's memory. "</span> |
| <span class="s2">"It should only be used if the resulting list is expected to be small."</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span></div> |
| |
| <span class="n">tolist</span> <span class="o">=</span> <span class="n">to_list</span> |
| |
| <div class="viewcode-block" id="Series.drop_duplicates"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.drop_duplicates.html#pyspark.pandas.Series.drop_duplicates">[docs]</a> <span class="k">def</span> <span class="nf">drop_duplicates</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">keep</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"first"</span><span class="p">,</span> <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return Series with duplicate values removed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> keep : {'first', 'last', ``False``}, default 'first'</span> |
| <span class="sd"> Method to handle dropping duplicates:</span> |
| <span class="sd"> - 'first' : Drop duplicates except for the first occurrence.</span> |
| <span class="sd"> - 'last' : Drop duplicates except for the last occurrence.</span> |
| <span class="sd"> - ``False`` : Drop all duplicates.</span> |
| <span class="sd"> inplace : bool, default ``False``</span> |
| <span class="sd"> If ``True``, performs operation inplace and returns None.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with duplicates dropped.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Generate a Series with duplicated entries.</span> |
| |
| <span class="sd"> >>> s = ps.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'],</span> |
| <span class="sd"> ... name='animal')</span> |
| <span class="sd"> >>> s.sort_index()</span> |
| <span class="sd"> 0 lama</span> |
| <span class="sd"> 1 cow</span> |
| <span class="sd"> 2 lama</span> |
| <span class="sd"> 3 beetle</span> |
| <span class="sd"> 4 lama</span> |
| <span class="sd"> 5 hippo</span> |
| <span class="sd"> Name: animal, dtype: object</span> |
| |
| <span class="sd"> With the 'keep' parameter, the selection behaviour of duplicated values</span> |
| <span class="sd"> can be changed. The value 'first' keeps the first occurrence for each</span> |
| <span class="sd"> set of duplicated entries. The default value of keep is 'first'.</span> |
| |
| <span class="sd"> >>> s.drop_duplicates().sort_index()</span> |
| <span class="sd"> 0 lama</span> |
| <span class="sd"> 1 cow</span> |
| <span class="sd"> 3 beetle</span> |
| <span class="sd"> 5 hippo</span> |
| <span class="sd"> Name: animal, dtype: object</span> |
| |
| <span class="sd"> The value 'last' for parameter 'keep' keeps the last occurrence for</span> |
| <span class="sd"> each set of duplicated entries.</span> |
| |
| <span class="sd"> >>> s.drop_duplicates(keep='last').sort_index()</span> |
| <span class="sd"> 1 cow</span> |
| <span class="sd"> 3 beetle</span> |
| <span class="sd"> 4 lama</span> |
| <span class="sd"> 5 hippo</span> |
| <span class="sd"> Name: animal, dtype: object</span> |
| |
| <span class="sd"> The value ``False`` for parameter 'keep' discards all sets of</span> |
| <span class="sd"> duplicated entries. Setting the value of 'inplace' to ``True`` performs</span> |
| <span class="sd"> the operation inplace and returns ``None``.</span> |
| |
| <span class="sd"> >>> s.drop_duplicates(keep=False, inplace=True)</span> |
| <span class="sd"> >>> s.sort_index()</span> |
| <span class="sd"> 1 cow</span> |
| <span class="sd"> 3 beetle</span> |
| <span class="sd"> 5 hippo</span> |
| <span class="sd"> Name: animal, dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[[</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">]]</span><span class="o">.</span><span class="n">drop_duplicates</span><span class="p">(</span><span class="n">keep</span><span class="o">=</span><span class="n">keep</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.reindex"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.reindex.html#pyspark.pandas.Series.reindex">[docs]</a> <span class="k">def</span> <span class="nf">reindex</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Conform Series to new index with optional filling logic, placing</span> |
| <span class="sd"> NA/NaN in locations having no value in the previous index. A new object</span> |
| <span class="sd"> is produced.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> index : array-like, optional</span> |
| <span class="sd"> New labels / index to conform to; should be specified using keywords.</span> |
| <span class="sd"> Preferably an Index object to avoid duplicating data.</span> |
| <span class="sd"> fill_value : scalar, default np.NaN</span> |
| <span class="sd"> Value to use for missing values. Defaults to NaN, but can be any</span> |
| <span class="sd"> "compatible" value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series with changed index.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.reset_index : Remove row labels or move them to new columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> Create a series with some fictional data.</span> |
| |
| <span class="sd"> >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']</span> |
| <span class="sd"> >>> ser = ps.Series([200, 200, 404, 404, 301],</span> |
| <span class="sd"> ... index=index, name='http_status')</span> |
| <span class="sd"> >>> ser</span> |
| <span class="sd"> Firefox 200</span> |
| <span class="sd"> Chrome 200</span> |
| <span class="sd"> Safari 404</span> |
| <span class="sd"> IE10 404</span> |
| <span class="sd"> Konqueror 301</span> |
| <span class="sd"> Name: http_status, dtype: int64</span> |
| |
| <span class="sd"> Create a new index and reindex the Series. By default</span> |
| <span class="sd"> values in the new index that do not have corresponding</span> |
| <span class="sd"> records in the Series are assigned ``NaN``.</span> |
| |
| <span class="sd"> >>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',</span> |
| <span class="sd"> ... 'Chrome']</span> |
| <span class="sd"> >>> ser.reindex(new_index).sort_index()</span> |
| <span class="sd"> Chrome 200.0</span> |
| <span class="sd"> Comodo Dragon NaN</span> |
| <span class="sd"> IE10 404.0</span> |
| <span class="sd"> Iceweasel NaN</span> |
| <span class="sd"> Safari 404.0</span> |
| <span class="sd"> Name: http_status, dtype: float64</span> |
| |
| <span class="sd"> We can fill in the missing values by passing a value to</span> |
| <span class="sd"> the keyword ``fill_value``.</span> |
| |
| <span class="sd"> >>> ser.reindex(new_index, fill_value=0).sort_index()</span> |
| <span class="sd"> Chrome 200</span> |
| <span class="sd"> Comodo Dragon 0</span> |
| <span class="sd"> IE10 404</span> |
| <span class="sd"> Iceweasel 0</span> |
| <span class="sd"> Safari 404</span> |
| <span class="sd"> Name: http_status, dtype: int64</span> |
| |
| <span class="sd"> To further illustrate the filling functionality in</span> |
| <span class="sd"> ``reindex``, we will create a Series with a</span> |
| <span class="sd"> monotonically increasing index (for example, a sequence</span> |
| <span class="sd"> of dates).</span> |
| |
| <span class="sd"> >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')</span> |
| <span class="sd"> >>> ser2 = ps.Series([100, 101, np.nan, 100, 89, 88],</span> |
| <span class="sd"> ... name='prices', index=date_index)</span> |
| <span class="sd"> >>> ser2.sort_index()</span> |
| <span class="sd"> 2010-01-01 100.0</span> |
| <span class="sd"> 2010-01-02 101.0</span> |
| <span class="sd"> 2010-01-03 NaN</span> |
| <span class="sd"> 2010-01-04 100.0</span> |
| <span class="sd"> 2010-01-05 89.0</span> |
| <span class="sd"> 2010-01-06 88.0</span> |
| <span class="sd"> Name: prices, dtype: float64</span> |
| |
| <span class="sd"> Suppose we decide to expand the series to cover a wider</span> |
| <span class="sd"> date range.</span> |
| |
| <span class="sd"> >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')</span> |
| <span class="sd"> >>> ser2.reindex(date_index2).sort_index()</span> |
| <span class="sd"> 2009-12-29 NaN</span> |
| <span class="sd"> 2009-12-30 NaN</span> |
| <span class="sd"> 2009-12-31 NaN</span> |
| <span class="sd"> 2010-01-01 100.0</span> |
| <span class="sd"> 2010-01-02 101.0</span> |
| <span class="sd"> 2010-01-03 NaN</span> |
| <span class="sd"> 2010-01-04 100.0</span> |
| <span class="sd"> 2010-01-05 89.0</span> |
| <span class="sd"> 2010-01-06 88.0</span> |
| <span class="sd"> 2010-01-07 NaN</span> |
| <span class="sd"> Name: prices, dtype: float64</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">reindex</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> <span class="n">fill_value</span><span class="o">=</span><span class="n">fill_value</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">name</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.reindex_like"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.reindex_like.html#pyspark.pandas.Series.reindex_like">[docs]</a> <span class="k">def</span> <span class="nf">reindex_like</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="s2">"DataFrame"</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a Series with matching indices as other object.</span> |
| |
| <span class="sd"> Conform the object to the same index on all axes. Places NA/NaN in locations</span> |
| <span class="sd"> having no value in the previous index.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series or DataFrame</span> |
| <span class="sd"> Its row and column indices are used to define the new indices</span> |
| <span class="sd"> of this object.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with changed indices on each axis.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.set_index : Set row labels.</span> |
| <span class="sd"> DataFrame.reset_index : Remove row labels or move them to new columns.</span> |
| <span class="sd"> DataFrame.reindex : Change to new indices or expand indices.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Same as calling</span> |
| <span class="sd"> ``.reindex(index=other.index, ...)``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> s1 = ps.Series([24.3, 31.0, 22.0, 35.0],</span> |
| <span class="sd"> ... index=pd.date_range(start='2014-02-12',</span> |
| <span class="sd"> ... end='2014-02-15', freq='D'),</span> |
| <span class="sd"> ... name="temp_celsius")</span> |
| <span class="sd"> >>> s1</span> |
| <span class="sd"> 2014-02-12 24.3</span> |
| <span class="sd"> 2014-02-13 31.0</span> |
| <span class="sd"> 2014-02-14 22.0</span> |
| <span class="sd"> 2014-02-15 35.0</span> |
| <span class="sd"> Name: temp_celsius, dtype: float64</span> |
| |
| <span class="sd"> >>> s2 = ps.Series(["low", "low", "medium"],</span> |
| <span class="sd"> ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',</span> |
| <span class="sd"> ... '2014-02-15']),</span> |
| <span class="sd"> ... name="winspeed")</span> |
| <span class="sd"> >>> s2</span> |
| <span class="sd"> 2014-02-12 low</span> |
| <span class="sd"> 2014-02-13 low</span> |
| <span class="sd"> 2014-02-15 medium</span> |
| <span class="sd"> Name: winspeed, dtype: object</span> |
| |
| <span class="sd"> >>> s2.reindex_like(s1).sort_index()</span> |
| <span class="sd"> 2014-02-12 low</span> |
| <span class="sd"> 2014-02-13 low</span> |
| <span class="sd"> 2014-02-14 None</span> |
| <span class="sd"> 2014-02-15 medium</span> |
| <span class="sd"> Name: winspeed, dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reindex</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">other</span><span class="o">.</span><span class="n">index</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"other must be a pandas-on-Spark Series or DataFrame"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.fillna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.fillna.html#pyspark.pandas.Series.fillna">[docs]</a> <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""Fill NA/NaN values.</span> |
| |
| <span class="sd"> .. note:: the current implementation of the 'method' parameter in fillna uses Spark's Window</span> |
| <span class="sd"> without specifying a partition specification. This moves all data into</span> |
| <span class="sd"> a single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method with very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> value : scalar, dict, Series</span> |
| <span class="sd"> Value to use to fill holes. Alternately, a dict/Series of values</span> |
| <span class="sd"> specifying which value to use for each column.</span> |
| <span class="sd"> DataFrame is not supported.</span> |
| <span class="sd"> method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None</span> |
| <span class="sd"> Method to use for filling holes in reindexed Series. pad / ffill: propagate last valid</span> |
| <span class="sd"> observation forward to next valid. backfill / bfill:</span> |
| <span class="sd"> use next valid observation to fill gap.</span> |
| <span class="sd"> axis : {0 or `index`}</span> |
| <span class="sd"> 1 and `columns` are not supported.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Fill in place (do not create a new object)</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span> |
| <span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span> |
| <span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span> |
| <span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([np.nan, 2, 3, 4, np.nan, 6], name='x')</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> 4 NaN</span> |
| <span class="sd"> 5 6.0</span> |
| <span class="sd"> Name: x, dtype: float64</span> |
| |
| <span class="sd"> Replace all NaN elements with 0s.</span> |
| |
| <span class="sd"> >>> s.fillna(0)</span> |
| <span class="sd"> 0 0.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> 4 0.0</span> |
| <span class="sd"> 5 6.0</span> |
| <span class="sd"> Name: x, dtype: float64</span> |
| |
| <span class="sd"> We can also propagate non-null values forward or backward.</span> |
| |
| <span class="sd"> >>> s.fillna(method='ffill')</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> 4 4.0</span> |
| <span class="sd"> 5 6.0</span> |
| <span class="sd"> Name: x, dtype: float64</span> |
| |
| <span class="sd"> >>> s = ps.Series([np.nan, 'a', 'b', 'c', np.nan], name='x')</span> |
| <span class="sd"> >>> s.fillna(method='ffill')</span> |
| <span class="sd"> 0 None</span> |
| <span class="sd"> 1 a</span> |
| <span class="sd"> 2 b</span> |
| <span class="sd"> 3 c</span> |
| <span class="sd"> 4 c</span> |
| <span class="sd"> Name: x, dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_fillna</span><span class="p">(</span><span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">method</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="p">)</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">,</span> <span class="n">requires_same_anchor</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span></div> |
| |
| <span class="k">def</span> <span class="nf">_fillna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">part_cols</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"fillna currently only works for axis=0 or axis='index'"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="p">(</span><span class="n">value</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span><span class="n">method</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Must specify a fillna 'value' or 'method' parameter."</span><span class="p">)</span> |
| <span class="k">if</span> <span class="p">(</span><span class="n">method</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span><span class="n">method</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"ffill"</span><span class="p">,</span> <span class="s2">"pad"</span><span class="p">,</span> <span class="s2">"backfill"</span><span class="p">,</span> <span class="s2">"bfill"</span><span class="p">]):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Expecting 'pad', 'ffill', 'backfill' or 'bfill'."</span><span class="p">)</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">nullable</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">FloatType</span><span class="p">,</span> <span class="n">DoubleType</span><span class="p">)</span> |
| <span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| |
| <span class="n">cond</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| |
| <span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Unsupported type </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">value</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">limit</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"limit parameter for value is not support now"</span><span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">method</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"ffill"</span><span class="p">,</span> <span class="s2">"pad"</span><span class="p">]:</span> |
| <span class="n">func</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">last</span> |
| <span class="n">end</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span> <span class="o">-</span> <span class="mi">1</span> |
| <span class="k">if</span> <span class="n">limit</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">begin</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span> <span class="o">-</span> <span class="n">limit</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">begin</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span> |
| <span class="k">elif</span> <span class="n">method</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"bfill"</span><span class="p">,</span> <span class="s2">"backfill"</span><span class="p">]:</span> |
| <span class="n">func</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">first</span> |
| <span class="n">begin</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span> <span class="o">+</span> <span class="mi">1</span> |
| <span class="k">if</span> <span class="n">limit</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">end</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span> <span class="o">+</span> <span class="n">limit</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">end</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">unboundedFollowing</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="n">begin</span><span class="p">,</span> <span class="n">end</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">func</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_spark_column</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">,</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="c1"># TODO: dtype?</span> |
| <span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.dropna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.dropna.html#pyspark.pandas.Series.dropna">[docs]</a> <span class="k">def</span> <span class="nf">dropna</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a new Series with missing values removed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or 'index'}, default 0</span> |
| <span class="sd"> There is only one axis to drop values from.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> If True, do operation inplace and return None.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> Not in use.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with NA entries dropped from it.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ser = ps.Series([1., 2., np.nan])</span> |
| <span class="sd"> >>> ser</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Drop NA values from a Series.</span> |
| |
| <span class="sd"> >>> ser.dropna()</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Keep the Series with valid entries in the same variable.</span> |
| |
| <span class="sd"> >>> ser.dropna(inplace=True)</span> |
| <span class="sd"> >>> ser</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="c1"># TODO: last two examples from pandas produce different results.</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[[</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">]]</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.clip"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.clip.html#pyspark.pandas.Series.clip">[docs]</a> <span class="k">def</span> <span class="nf">clip</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">lower</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">upper</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Trim values at input threshold(s).</span> |
| |
| <span class="sd"> Assigns values outside boundary to boundary values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> lower : float or int, default None</span> |
| <span class="sd"> Minimum threshold value. All values below this threshold will be set to it.</span> |
| <span class="sd"> upper : float or int, default None</span> |
| <span class="sd"> Maximum threshold value. All values above this threshold will be set to it.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with the values outside the clip boundaries replaced</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.Series([0, 2, 4]).clip(1, 3)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> One difference between this implementation and pandas is that running</span> |
| <span class="sd"> `pd.Series(['a', 'b']).clip(0, 1)` will crash with "TypeError: '<=' not supported between</span> |
| <span class="sd"> instances of 'str' and 'int'" while `ps.Series(['a', 'b']).clip(0, 1)` will output the</span> |
| <span class="sd"> original Series, simply ignoring the incompatible types.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">lower</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">upper</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"List-like value are not supported for 'lower' and 'upper' at the "</span> <span class="o">+</span> <span class="s2">"moment"</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">lower</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">upper</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="n">lower</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span> <span class="o"><</span> <span class="n">lower</span><span class="p">,</span> <span class="n">lower</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">upper</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span> <span class="o">></span> <span class="n">upper</span><span class="p">,</span> <span class="n">upper</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span> |
| <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> |
| <span class="n">field</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="Series.drop"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.drop.html#pyspark.pandas.Series.drop">[docs]</a> <span class="k">def</span> <span class="nf">drop</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">labels</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return Series with specified index labels removed.</span> |
| |
| <span class="sd"> Remove elements of a Series based on specifying the index labels.</span> |
| <span class="sd"> When using a multi-index, labels on different levels can be removed by specifying the level.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> labels : single label or list-like</span> |
| <span class="sd"> Index labels to drop.</span> |
| <span class="sd"> index : single label or list-like</span> |
| <span class="sd"> Redundant for application on Series, but 'index' can be used instead of 'labels'.</span> |
| <span class="sd"> level : int or level name, optional</span> |
| <span class="sd"> For MultiIndex, level for which the labels will be removed.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with specified index labels removed.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.dropna</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(data=np.arange(3), index=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> A 0</span> |
| <span class="sd"> B 1</span> |
| <span class="sd"> C 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Drop single label A</span> |
| |
| <span class="sd"> >>> s.drop('A')</span> |
| <span class="sd"> B 1</span> |
| <span class="sd"> C 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Drop labels B and C</span> |
| |
| <span class="sd"> >>> s.drop(labels=['B', 'C'])</span> |
| <span class="sd"> A 0</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Using 'index' rather than 'labels' returns exactly the same result.</span> |
| |
| <span class="sd"> >>> s.drop(index='A')</span> |
| <span class="sd"> B 1</span> |
| <span class="sd"> C 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.drop(index=['B', 'C'])</span> |
| <span class="sd"> A 0</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> MultiIndex is also supported.</span> |
| |
| <span class="sd"> >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],</span> |
| <span class="sd"> ... ['speed', 'weight', 'length']],</span> |
| <span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span> |
| <span class="sd"> >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],</span> |
| <span class="sd"> ... index=midx)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> lama speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.drop(labels='weight', level=1)</span> |
| <span class="sd"> lama speed 45.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.drop(('lama', 'weight'))</span> |
| <span class="sd"> lama speed 45.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.drop([('lama', 'speed'), ('falcon', 'weight')])</span> |
| <span class="sd"> lama weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_drop</span><span class="p">(</span><span class="n">labels</span><span class="o">=</span><span class="n">labels</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> <span class="n">level</span><span class="o">=</span><span class="n">level</span><span class="p">))</span></div> |
| |
| <span class="k">def</span> <span class="nf">_drop</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">labels</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">labels</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Cannot specify both 'labels' and 'index'"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_drop</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">labels</span><span class="p">,</span> <span class="n">level</span><span class="o">=</span><span class="n">level</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">level</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="o">>=</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"'level' should be less than the number of indexes"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">index</span><span class="p">):</span> |
| <span class="n">index_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">index</span><span class="p">)]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">index</span><span class="p">):</span> |
| <span class="n">index_list</span> <span class="o">=</span> <span class="p">[(</span><span class="n">index</span><span class="p">,)]</span> |
| <span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="n">is_name_like_value</span><span class="p">(</span><span class="n">idxes</span><span class="p">,</span> <span class="n">allow_tuple</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="k">for</span> <span class="n">idxes</span> <span class="ow">in</span> <span class="n">index</span><span class="p">):</span> |
| <span class="n">index_list</span> <span class="o">=</span> <span class="p">[(</span><span class="n">idex</span><span class="p">,)</span> <span class="k">for</span> <span class="n">idex</span> <span class="ow">in</span> <span class="n">index</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">idxes</span><span class="p">)</span> <span class="k">for</span> <span class="n">idxes</span> <span class="ow">in</span> <span class="n">index</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"If the given index is a list, it "</span> |
| <span class="s2">"should only contains names as all tuples or all non tuples "</span> |
| <span class="s2">"that contain index names"</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_list</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">index</span><span class="p">)</span> |
| |
| <span class="n">drop_index_scols</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">idxes</span> <span class="ow">in</span> <span class="n">index_list</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">index_scols</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">lvl</span><span class="p">]</span> <span class="o">==</span> <span class="n">idx</span> |
| <span class="k">for</span> <span class="n">lvl</span><span class="p">,</span> <span class="n">idx</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">idxes</span><span class="p">,</span> <span class="n">level</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">except</span> <span class="ne">IndexError</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"Key length (</span><span class="si">{}</span><span class="s2">) exceeds index depth (</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">idxes</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">drop_index_scols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&</span> <span class="n">y</span><span class="p">,</span> <span class="n">index_scols</span><span class="p">))</span> |
| |
| <span class="n">cond</span> <span class="o">=</span> <span class="o">~</span><span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">|</span> <span class="n">y</span><span class="p">,</span> <span class="n">drop_index_scols</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Need to specify at least one of 'labels' or 'index'"</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.head"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.head.html#pyspark.pandas.Series.head">[docs]</a> <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the first n rows.</span> |
| |
| <span class="sd"> This function returns the first n rows for the object based on position.</span> |
| <span class="sd"> It is useful for quickly testing if your object has the right type of data in it.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, default 5</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> The first n rows of the caller object.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion']})</span> |
| <span class="sd"> >>> df.animal.head(2) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> 0 alligator</span> |
| <span class="sd"> 1 bee</span> |
| <span class="sd"> Name: animal, dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">n</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.last"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.last.html#pyspark.pandas.Series.last">[docs]</a> <span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Select final periods of time series data based on a date offset.</span> |
| |
| <span class="sd"> When having a Series with dates as index, this function can</span> |
| <span class="sd"> select the last few elements based on a date offset.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> offset : str or DateOffset</span> |
| <span class="sd"> The offset length of the data that will be selected. For instance,</span> |
| <span class="sd"> '3D' will display all the rows having their index within the last 3 days.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> A subset of the caller.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If the index is not a :class:`DatetimeIndex`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')</span> |
| <span class="sd"> >>> psser = ps.Series([1, 2, 3, 4], index=index)</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 2018-04-09 1</span> |
| <span class="sd"> 2018-04-11 2</span> |
| <span class="sd"> 2018-04-13 3</span> |
| <span class="sd"> 2018-04-15 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Get the rows for the last 3 days:</span> |
| |
| <span class="sd"> >>> psser.last('3D')</span> |
| <span class="sd"> 2018-04-13 3</span> |
| <span class="sd"> 2018-04-15 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Notice the data for the last 3 calendar days were returned, not the last</span> |
| <span class="sd"> 3 observed days in the dataset, and therefore data for 2018-04-11 was</span> |
| <span class="sd"> not returned.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">last</span><span class="p">(</span><span class="n">offset</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.first"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.first.html#pyspark.pandas.Series.first">[docs]</a> <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Select first periods of time series data based on a date offset.</span> |
| |
| <span class="sd"> When having a Series with dates as index, this function can</span> |
| <span class="sd"> select the first few elements based on a date offset.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> offset : str or DateOffset</span> |
| <span class="sd"> The offset length of the data that will be selected. For instance,</span> |
| <span class="sd"> '3D' will display all the rows having their index within the first 3 days.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> A subset of the caller.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If the index is not a :class:`DatetimeIndex`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')</span> |
| <span class="sd"> >>> psser = ps.Series([1, 2, 3, 4], index=index)</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 2018-04-09 1</span> |
| <span class="sd"> 2018-04-11 2</span> |
| <span class="sd"> 2018-04-13 3</span> |
| <span class="sd"> 2018-04-15 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Get the rows for the first 3 days:</span> |
| |
| <span class="sd"> >>> psser.first('3D')</span> |
| <span class="sd"> 2018-04-09 1</span> |
| <span class="sd"> 2018-04-11 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Notice the data for the first 3 calendar days were returned, not the first</span> |
| <span class="sd"> 3 observed days in the dataset, and therefore data for 2018-04-13 was</span> |
| <span class="sd"> not returned.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">first</span><span class="p">(</span><span class="n">offset</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: Categorical type isn't supported (due to PySpark's limitation) and</span> |
| <span class="c1"># some doctests related to timestamps were not added.</span> |
| <div class="viewcode-block" id="Series.unique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.unique.html#pyspark.pandas.Series.unique">[docs]</a> <span class="k">def</span> <span class="nf">unique</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return unique values of Series object.</span> |
| |
| <span class="sd"> Uniques are returned in order of appearance. Hash table-based unique,</span> |
| <span class="sd"> therefore does NOT sort.</span> |
| |
| <span class="sd"> .. note:: This method returns newly created Series whereas pandas returns</span> |
| <span class="sd"> the unique values as a NumPy array.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Returns the unique values as a Series.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Index.unique</span> |
| <span class="sd"> groupby.SeriesGroupBy.unique</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psser = ps.Series([2, 1, 3, 3], name='A')</span> |
| <span class="sd"> >>> psser.unique().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS</span> |
| <span class="sd"> <BLANKLINE></span> |
| <span class="sd"> ... 1</span> |
| <span class="sd"> ... 2</span> |
| <span class="sd"> ... 3</span> |
| <span class="sd"> Name: A, dtype: int64</span> |
| |
| <span class="sd"> >>> ps.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique()</span> |
| <span class="sd"> 0 2016-01-01</span> |
| <span class="sd"> dtype: datetime64[ns]</span> |
| |
| <span class="sd"> >>> psser.name = ('x', 'a')</span> |
| <span class="sd"> >>> psser.unique().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS</span> |
| <span class="sd"> &lt;BLANKLINE&gt;</span> |
| <span class="sd"> ... 1</span> |
| <span class="sd"> ... 2</span> |
| <span class="sd"> ... 3</span> |
| <span class="sd"> Name: (x, a), dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="Series.sort_values"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.sort_values.html#pyspark.pandas.Series.sort_values">[docs]</a> <span class="k">def</span> <span class="nf">sort_values</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">na_position</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"last"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Sort by the values.</span> |
| |
| <span class="sd"> Sort a Series in ascending or descending order by some criterion.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ascending : bool or list of bool, default True</span> |
| <span class="sd"> Sort ascending vs. descending. Specify list for multiple sort</span> |
| <span class="sd"> orders. If this is a list of bools, must match the length of</span> |
| <span class="sd"> the by.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> if True, perform operation in-place</span> |
| <span class="sd"> na_position : {'first', 'last'}, default 'last'</span> |
| <span class="sd"> `first` puts NaNs at the beginning, `last` puts NaNs at the end</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> sorted_obj : Series ordered by values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([np.nan, 1, 3, 10, 5])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> 4 5.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Sort values in ascending order (default behaviour)</span> |
| |
| <span class="sd"> >>> s.sort_values(ascending=True)</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 4 5.0</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Sort values in descending order</span> |
| |
| <span class="sd"> >>> s.sort_values(ascending=False)</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> 4 5.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Sort values inplace</span> |
| |
| <span class="sd"> >>> s.sort_values(ascending=False, inplace=True)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> 4 5.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Sort values putting NAs first</span> |
| |
| <span class="sd"> >>> s.sort_values(na_position='first')</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 4 5.0</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Sort a series of strings</span> |
| |
| <span class="sd"> >>> s = ps.Series(['z', 'b', 'd', 'a', 'c'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 z</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 d</span> |
| <span class="sd"> 3 a</span> |
| <span class="sd"> 4 c</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> >>> s.sort_values()</span> |
| <span class="sd"> 3 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 4 c</span> |
| <span class="sd"> 2 d</span> |
| <span class="sd"> 0 z</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[[</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">]]</span><span class="o">.</span><span class="n">_sort</span><span class="p">(</span> |
| <span class="n">by</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">],</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">,</span> <span class="n">na_position</span><span class="o">=</span><span class="n">na_position</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.sort_index"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.sort_index.html#pyspark.pandas.Series.sort_index">[docs]</a> <span class="k">def</span> <span class="nf">sort_index</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">kind</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">na_position</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"last"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Sort object by labels (along an axis)</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : index, columns to direct sorting. Currently, only axis = 0 is supported.</span> |
| <span class="sd"> level : int or level name or list of ints or list of level names</span> |
| <span class="sd"> if not None, sort on values in specified index level(s)</span> |
| <span class="sd"> ascending : boolean, default True</span> |
| <span class="sd"> Sort ascending vs. descending</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> if True, perform operation in-place</span> |
| <span class="sd"> kind : str, default None</span> |
| <span class="sd"> pandas-on-Spark does not allow specifying the sorting algorithm at the moment,</span> |
| <span class="sd"> default None</span> |
| <span class="sd"> na_position : {'first', 'last'}, default 'last'</span> |
| <span class="sd"> first puts NaNs at the beginning, last puts NaNs at the end. Not implemented for</span> |
| <span class="sd"> MultiIndex.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> sorted_obj : Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.Series([2, 1, np.nan], index=['b', 'a', np.nan])</span> |
| |
| <span class="sd"> >>> df.sort_index()</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> b 2.0</span> |
| <span class="sd"> NaN NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.sort_index(ascending=False)</span> |
| <span class="sd"> b 2.0</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> NaN NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.sort_index(na_position='first')</span> |
| <span class="sd"> NaN NaN</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> b 2.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.sort_index(inplace=True)</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> b 2.0</span> |
| <span class="sd"> NaN NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df = ps.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]], name='0')</span> |
| |
| <span class="sd"> >>> df.sort_index()</span> |
| <span class="sd"> a 0 3</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> b 0 1</span> |
| <span class="sd"> 1 0</span> |
| <span class="sd"> Name: 0, dtype: int64</span> |
| |
| <span class="sd"> >>> df.sort_index(level=1) # doctest: +SKIP</span> |
| <span class="sd"> a 0 3</span> |
| <span class="sd"> b 0 1</span> |
| <span class="sd"> a 1 2</span> |
| <span class="sd"> b 1 0</span> |
| <span class="sd"> Name: 0, dtype: int64</span> |
| |
| <span class="sd"> >>> df.sort_index(level=[1, 0])</span> |
| <span class="sd"> a 0 3</span> |
| <span class="sd"> b 0 1</span> |
| <span class="sd"> a 1 2</span> |
| <span class="sd"> b 1 0</span> |
| <span class="sd"> Name: 0, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[[</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">]]</span><span class="o">.</span><span class="n">sort_index</span><span class="p">(</span> |
| <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">level</span><span class="o">=</span><span class="n">level</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">,</span> <span class="n">kind</span><span class="o">=</span><span class="n">kind</span><span class="p">,</span> <span class="n">na_position</span><span class="o">=</span><span class="n">na_position</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.swaplevel"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.swaplevel.html#pyspark.pandas.Series.swaplevel">[docs]</a> <span class="k">def</span> <span class="nf">swaplevel</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mi">2</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">copy</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Swap levels i and j in a MultiIndex.</span> |
| <span class="sd"> Default is to swap the two innermost levels of the index.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> i, j : int, str</span> |
| <span class="sd"> Level of the indices to be swapped. Can pass level name as string.</span> |
| <span class="sd"> copy : bool, default True</span> |
| <span class="sd"> Whether to copy underlying data. Must be True.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with levels swapped in MultiIndex.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> midx = pd.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names = ['word', 'number'])</span> |
| <span class="sd"> >>> midx # doctest: +SKIP</span> |
| <span class="sd"> MultiIndex([('a', 1),</span> |
| <span class="sd"> ('b', 2)],</span> |
| <span class="sd"> names=['word', 'number'])</span> |
| <span class="sd"> >>> psser = ps.Series(['x', 'y'], index=midx)</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> word number</span> |
| <span class="sd"> a 1 x</span> |
| <span class="sd"> b 2 y</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> >>> psser.swaplevel()</span> |
| <span class="sd"> number word</span> |
| <span class="sd"> 1 a x</span> |
| <span class="sd"> 2 b y</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> >>> psser.swaplevel(0, 1)</span> |
| <span class="sd"> number word</span> |
| <span class="sd"> 1 a x</span> |
| <span class="sd"> 2 b y</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> >>> psser.swaplevel('number', 'word')</span> |
| <span class="sd"> number word</span> |
| <span class="sd"> 1 a x</span> |
| <span class="sd"> 2 b y</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">copy</span> <span class="ow">is</span> <span class="kc">True</span> |
| |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">swaplevel</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.swapaxes"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.swapaxes.html#pyspark.pandas.Series.swapaxes">[docs]</a> <span class="k">def</span> <span class="nf">swapaxes</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">:</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">copy</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Interchange axes and swap values axes appropriately.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> i: {0 or 'index', 1 or 'columns'}. The axis to swap.</span> |
| <span class="sd"> j: {0 or 'index', 1 or 'columns'}. The axis to swap.</span> |
| <span class="sd"> copy : bool, default True.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psser = ps.Series([1, 2, 3], index=["x", "y", "z"])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> x 1</span> |
| <span class="sd"> y 2</span> |
| <span class="sd"> z 3</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> >>></span> |
| <span class="sd"> >>> psser.swapaxes(0, 0)</span> |
| <span class="sd"> x 1</span> |
| <span class="sd"> y 2</span> |
| <span class="sd"> z 3</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">copy</span> <span class="ow">is</span> <span class="kc">True</span> |
| |
| <span class="n">i</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="n">j</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">j</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">i</span> <span class="o">==</span> <span class="n">j</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Axis must be 0 for Series"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="Series.add_prefix"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.add_prefix.html#pyspark.pandas.Series.add_prefix">[docs]</a> <span class="k">def</span> <span class="nf">add_prefix</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">prefix</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Prefix labels with string `prefix`.</span> |
| |
| <span class="sd"> For Series, the row labels are prefixed.</span> |
| <span class="sd"> For DataFrame, the column labels are prefixed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> prefix : str</span> |
| <span class="sd"> The string to add before each label.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> New Series with updated labels.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.add_suffix: Suffix row labels with string `suffix`.</span> |
| <span class="sd"> DataFrame.add_suffix: Suffix column labels with string `suffix`.</span> |
| <span class="sd"> DataFrame.add_prefix: Prefix column labels with string `prefix`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, 4])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.add_prefix('item_')</span> |
| <span class="sd"> item_0 1</span> |
| <span class="sd"> item_1 2</span> |
| <span class="sd"> item_2 3</span> |
| <span class="sd"> item_3 4</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prefix</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">concat</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">prefix</span><span class="p">),</span> <span class="n">index_spark_column</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_spark_column_name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index_spark_column</span><span class="p">,</span> <span class="n">index_spark_column_name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_columns</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="p">([</span><span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)))</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.add_suffix"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.add_suffix.html#pyspark.pandas.Series.add_suffix">[docs]</a> <span class="k">def</span> <span class="nf">add_suffix</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">suffix</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Suffix labels with string `suffix`.</span> |
| |
| <span class="sd"> For Series, the row labels are suffixed.</span> |
| <span class="sd"> For DataFrame, the column labels are suffixed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> suffix : str</span> |
| <span class="sd"> The string to add after each label.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> New Series with updated labels.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.add_prefix: Prefix row labels with string `prefix`.</span> |
| <span class="sd"> DataFrame.add_prefix: Prefix column labels with string `prefix`.</span> |
| <span class="sd"> DataFrame.add_suffix: Suffix column labels with string `suffix`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, 4])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.add_suffix('_item')</span> |
| <span class="sd"> 0_item 1</span> |
| <span class="sd"> 1_item 2</span> |
| <span class="sd"> 2_item 3</span> |
| <span class="sd"> 3_item 4</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">suffix</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">concat</span><span class="p">(</span><span class="n">index_spark_column</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">suffix</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_spark_column_name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index_spark_column</span><span class="p">,</span> <span class="n">index_spark_column_name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_columns</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="p">([</span><span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)))</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.corr"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.corr.html#pyspark.pandas.Series.corr">[docs]</a> <span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"pearson"</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute correlation with `other` Series, excluding missing values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series</span> |
| <span class="sd"> method : {'pearson', 'spearman'}</span> |
| <span class="sd"> * pearson : standard correlation coefficient</span> |
| <span class="sd"> * spearman : Spearman rank correlation</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> correlation : float</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'s1': [.2, .0, .6, .2],</span> |
| <span class="sd"> ... 's2': [.3, .6, .0, .1]})</span> |
| <span class="sd"> >>> s1 = df.s1</span> |
| <span class="sd"> >>> s2 = df.s2</span> |
| <span class="sd"> >>> s1.corr(s2, method='pearson') # doctest: +ELLIPSIS</span> |
| <span class="sd"> -0.851064...</span> |
| |
| <span class="sd"> >>> s1.corr(s2, method='spearman') # doctest: +ELLIPSIS</span> |
| <span class="sd"> -0.948683...</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> There are behavior differences between pandas-on-Spark and pandas.</span> |
| |
| <span class="sd"> * the `method` argument only accepts 'pearson', 'spearman'</span> |
| <span class="sd"> * the data should not contain NaNs. pandas-on-Spark will return an error.</span> |
| <span class="sd"> * pandas-on-Spark doesn't support the following argument(s).</span> |
| |
| <span class="sd"> * `min_periods` argument is not supported</span> |
| <span class="sd"> """</span> |
| <span class="c1"># This implementation is suboptimal because it computes more than necessary,</span> |
| <span class="c1"># but it should be a start</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"__corr_arg1__"</span><span class="p">,</span> <span class="s2">"__corr_arg2__"</span><span class="p">]</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">assign</span><span class="p">(</span><span class="n">__corr_arg1__</span><span class="o">=</span><span class="bp">self</span><span class="p">,</span> <span class="n">__corr_arg2__</span><span class="o">=</span><span class="n">other</span><span class="p">)[</span><span class="n">columns</span><span class="p">]</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">columns</span> |
| <span class="n">c</span> <span class="o">=</span> <span class="n">corr</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">c</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="n">columns</span><span class="p">)]</span></div> |
| |
| <div class="viewcode-block" id="Series.nsmallest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.nsmallest.html#pyspark.pandas.Series.nsmallest">[docs]</a> <span class="k">def</span> <span class="nf">nsmallest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the smallest `n` elements.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, default 5</span> |
| <span class="sd"> Return this many ascending sorted values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> The `n` smallest values in the Series, sorted in increasing order.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.nlargest: Get the `n` largest elements.</span> |
| <span class="sd"> Series.sort_values: Sort Series by values.</span> |
| <span class="sd"> Series.head: Return the first `n` rows.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Faster than ``.sort_values().head(n)`` for small `n` relative to</span> |
| <span class="sd"> the size of the ``Series`` object.</span> |
| <span class="sd"> In pandas-on-Spark, thanks to Spark's lazy execution and query optimizer,</span> |
| <span class="sd"> the two would have the same performance.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> data = [1, 2, 3, 4, np.nan ,6, 7, 8]</span> |
| <span class="sd"> >>> s = ps.Series(data)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> 4 NaN</span> |
| <span class="sd"> 5 6.0</span> |
| <span class="sd"> 6 7.0</span> |
| <span class="sd"> 7 8.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> The `n` smallest elements where ``n=5`` by default.</span> |
| |
| <span class="sd"> >>> s.nsmallest()</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> 5 6.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.nsmallest(3)</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">ascending</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">n</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.nlargest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.nlargest.html#pyspark.pandas.Series.nlargest">[docs]</a> <span class="k">def</span> <span class="nf">nlargest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the largest `n` elements.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, default 5</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> The `n` largest values in the Series, sorted in decreasing order.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.nsmallest: Get the `n` smallest elements.</span> |
| <span class="sd"> Series.sort_values: Sort Series by values.</span> |
| <span class="sd"> Series.head: Return the first `n` rows.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Faster than ``.sort_values(ascending=False).head(n)`` for small `n`</span> |
| <span class="sd"> relative to the size of the ``Series`` object.</span> |
| |
| <span class="sd"> In pandas-on-Spark, thanks to Spark's lazy execution and query optimizer,</span> |
| <span class="sd"> the two would have the same performance.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> data = [1, 2, 3, 4, np.nan ,6, 7, 8]</span> |
| <span class="sd"> >>> s = ps.Series(data)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> 4 NaN</span> |
| <span class="sd"> 5 6.0</span> |
| <span class="sd"> 6 7.0</span> |
| <span class="sd"> 7 8.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> The `n` largest elements where ``n=5`` by default.</span> |
| |
| <span class="sd"> >>> s.nlargest()</span> |
| <span class="sd"> 7 8.0</span> |
| <span class="sd"> 6 7.0</span> |
| <span class="sd"> 5 6.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.nlargest(n=3)</span> |
| <span class="sd"> 7 8.0</span> |
| <span class="sd"> 6 7.0</span> |
| <span class="sd"> 5 6.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">n</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.append"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.append.html#pyspark.pandas.Series.append">[docs]</a> <span class="k">def</span> <span class="nf">append</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">to_append</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">ignore_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">verify_integrity</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Concatenate two or more Series.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> to_append : Series or list/tuple of Series</span> |
| <span class="sd"> ignore_index : boolean, default False</span> |
| <span class="sd"> If True, do not use the index labels.</span> |
| <span class="sd"> verify_integrity : boolean, default False</span> |
| <span class="sd"> If True, raise Exception on creating index with duplicates</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> appended : Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s1 = ps.Series([1, 2, 3])</span> |
| <span class="sd"> >>> s2 = ps.Series([4, 5, 6])</span> |
| <span class="sd"> >>> s3 = ps.Series([4, 5, 6], index=[3,4,5])</span> |
| |
| <span class="sd"> >>> s1.append(s2)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 0 4</span> |
| <span class="sd"> 1 5</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s1.append(s3)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> 4 5</span> |
| <span class="sd"> 5 6</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> With ignore_index set to True:</span> |
| |
| <span class="sd"> >>> s1.append(s2, ignore_index=True)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> 4 5</span> |
| <span class="sd"> 5 6</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">to_append</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="n">ignore_index</span><span class="p">,</span> <span class="n">verify_integrity</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.sample"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.sample.html#pyspark.pandas.Series.sample">[docs]</a> <span class="k">def</span> <span class="nf">sample</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">n</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">frac</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">replace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">random_state</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="n">n</span><span class="p">,</span> <span class="n">frac</span><span class="o">=</span><span class="n">frac</span><span class="p">,</span> <span class="n">replace</span><span class="o">=</span><span class="n">replace</span><span class="p">,</span> <span class="n">random_state</span><span class="o">=</span><span class="n">random_state</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <span class="n">sample</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">sample</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="Series.hist"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.hist.html#pyspark.pandas.Series.hist">[docs]</a> <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">hist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bins</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">plot</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">bins</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">)</span></div> |
| |
| <span class="n">hist</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">PandasOnSparkPlotAccessor</span><span class="o">.</span><span class="n">hist</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="Series.apply"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.apply.html#pyspark.pandas.Series.apply">[docs]</a> <span class="k">def</span> <span class="nf">apply</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span> <span class="o">**</span><span class="n">kwds</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Invoke function on values of Series.</span> |
| |
| <span class="sd"> Can be a Python function that only works on the Series.</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span> |
| |
| <span class="sd"> >>> def square(x) -> np.int32:</span> |
| <span class="sd"> ... return x ** 2</span> |
| |
| <span class="sd"> pandas-on-Spark uses return type hint and does not try to infer the type.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> Python function to apply. A return type hint is recommended; without one the type is inferred.</span> |
| <span class="sd"> args : tuple</span> |
| <span class="sd"> Positional arguments passed to func after the series value.</span> |
| <span class="sd"> **kwds</span> |
| <span class="sd"> Additional keyword arguments passed to func.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.aggregate : Only perform aggregating type operations.</span> |
| <span class="sd"> Series.transform : Only perform transforming type operations.</span> |
| <span class="sd"> DataFrame.apply : The equivalent function for DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Create a Series with typical summer temperatures for each city.</span> |
| |
| <span class="sd"> >>> s = ps.Series([20, 21, 12],</span> |
| <span class="sd"> ... index=['London', 'New York', 'Helsinki'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> London 20</span> |
| <span class="sd"> New York 21</span> |
| <span class="sd"> Helsinki 12</span> |
| <span class="sd"> dtype: int64</span> |
| |
| |
| <span class="sd"> Square the values by defining a function and passing it as an</span> |
| <span class="sd"> argument to ``apply()``.</span> |
| |
| <span class="sd"> >>> def square(x) -> np.int64:</span> |
| <span class="sd"> ... return x ** 2</span> |
| <span class="sd"> >>> s.apply(square)</span> |
| <span class="sd"> London 400</span> |
| <span class="sd"> New York 441</span> |
| <span class="sd"> Helsinki 144</span> |
| <span class="sd"> dtype: int64</span> |
| |
| |
| <span class="sd"> Define a custom function that needs additional positional</span> |
| <span class="sd"> arguments and pass these additional arguments using the</span> |
| <span class="sd"> ``args`` keyword</span> |
| |
| <span class="sd"> >>> def subtract_custom_value(x, custom_value) -> np.int64:</span> |
| <span class="sd"> ... return x - custom_value</span> |
| |
| <span class="sd"> >>> s.apply(subtract_custom_value, args=(5,))</span> |
| <span class="sd"> London 15</span> |
| <span class="sd"> New York 16</span> |
| <span class="sd"> Helsinki 7</span> |
| <span class="sd"> dtype: int64</span> |
| |
| |
| <span class="sd"> Define a custom function that takes keyword arguments</span> |
| <span class="sd"> and pass these arguments to ``apply``</span> |
| |
| <span class="sd"> >>> def add_custom_values(x, **kwargs) -> np.int64:</span> |
| <span class="sd"> ... for month in kwargs:</span> |
| <span class="sd"> ... x += kwargs[month]</span> |
| <span class="sd"> ... return x</span> |
| |
| <span class="sd"> >>> s.apply(add_custom_values, june=30, july=20, august=25)</span> |
| <span class="sd"> London 95</span> |
| <span class="sd"> New York 96</span> |
| <span class="sd"> Helsinki 87</span> |
| <span class="sd"> dtype: int64</span> |
| |
| |
| <span class="sd"> Use a function from the NumPy library</span> |
| |
| <span class="sd"> >>> def numpy_log(col) -> np.float64:</span> |
| <span class="sd"> ... return np.log(col)</span> |
| <span class="sd"> >>> s.apply(numpy_log)</span> |
| <span class="sd"> London 2.995732</span> |
| <span class="sd"> New York 3.044522</span> |
| <span class="sd"> Helsinki 2.484907</span> |
| <span class="sd"> dtype: float64</span> |
| |
| |
| <span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span> |
| |
| <span class="sd"> >>> s.apply(np.log)</span> |
| <span class="sd"> London 2.995732</span> |
| <span class="sd"> New York 3.044522</span> |
| <span class="sd"> Helsinki 2.484907</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">),</span> <span class="s2">"the first argument should be a callable function."</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"return"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span> |
| <span class="c1"># Falls back to schema inference if it fails to get signature.</span> |
| <span class="n">should_infer_schema</span> <span class="o">=</span> <span class="kc">True</span> |
| |
| <span class="k">def</span> <span class="nf">apply_each</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">s</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">_transform_batch</span><span class="p">(</span><span class="n">apply_each</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sig_return</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">sig_return</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Expected the return type of this function to be of scalar type, "</span> |
| <span class="s2">"but found type </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">sig_return</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">return_type</span> <span class="o">=</span> <span class="n">sig_return</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">_transform_batch</span><span class="p">(</span><span class="n">apply_each</span><span class="p">,</span> <span class="n">return_type</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: not all arguments are implemented compared to pandas for now.</span> |
| <div class="viewcode-block" id="Series.aggregate"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.aggregate.html#pyspark.pandas.Series.aggregate">[docs]</a> <span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""Aggregate using one or more operations over the specified axis.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : str or a list of str</span> |
| <span class="sd"> Function name(s), as strings, to apply to the series.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> scalar, Series</span> |
| <span class="sd"> The return can be:</span> |
| <span class="sd"> - scalar : when Series.agg is called with single function</span> |
| <span class="sd"> - Series : when Series.agg is called with several functions</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> `agg` is an alias for `aggregate`. Use the alias.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.apply : Invoke function on a Series.</span> |
| <span class="sd"> Series.transform : Only perform transforming type operations.</span> |
| <span class="sd"> Series.groupby : Perform operations over groups.</span> |
| <span class="sd"> DataFrame.aggregate : The equivalent function for DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, 4])</span> |
| <span class="sd"> >>> s.agg('min')</span> |
| <span class="sd"> 1</span> |
| |
| <span class="sd"> >>> s.agg(['min', 'max']).sort_index()</span> |
| <span class="sd"> max 4</span> |
| <span class="sd"> min 1</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">aggregate</span><span class="p">(</span><span class="n">func</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">return</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">)()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"func must be a string or list of strings"</span><span class="p">)</span></div> |
| |
| <span class="n">agg</span> <span class="o">=</span> <span class="n">aggregate</span> |
| |
| <span class="k">def</span> <span class="nf">transpose</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the transpose, which is by definition self.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> It returns the same object as the transpose of the given series object, which is by</span> |
| <span class="sd"> definition self.</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 2, 3])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.transpose()</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="n">T</span> <span class="o">=</span> <span class="nb">property</span><span class="p">(</span><span class="n">transpose</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.transform"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.transform.html#pyspark.pandas.Series.transform">[docs]</a> <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Callable</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Callable</span><span class="p">]],</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Call ``func`` producing the same type as `self` with transformed values</span> |
| <span class="sd"> and that has the same axis length as input.</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span> |
| |
| <span class="sd"> >>> def square(x) -> np.int32:</span> |
| <span class="sd"> ... return x ** 2</span> |
| |
| <span class="sd"> pandas-on-Spark uses return type hint and does not try to infer the type.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function or list</span> |
| <span class="sd"> A function or a list of functions to use for transforming the data.</span> |
| <span class="sd"> axis : {0 or 'index'}, default 0</span> |
| <span class="sd"> Can only be set to 0 at the moment.</span> |
| <span class="sd"> *args</span> |
| <span class="sd"> Positional arguments to pass to `func`.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> Keyword arguments to pass to `func`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> An instance of the same type as `self` that must have the same length as the input.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.aggregate : Only perform aggregating type operations.</span> |
| <span class="sd"> Series.apply : Invoke function on Series.</span> |
| <span class="sd"> DataFrame.transform : The equivalent function for DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> s = ps.Series(range(3))</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> def sqrt(x) -> float:</span> |
| <span class="sd"> ... return np.sqrt(x)</span> |
| <span class="sd"> >>> s.transform(sqrt)</span> |
| <span class="sd"> 0 0.000000</span> |
| <span class="sd"> 1 1.000000</span> |
| <span class="sd"> 2 1.414214</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Even though the resulting instance must have the same length as the</span> |
| <span class="sd"> input, it is possible to provide several input functions:</span> |
| |
| <span class="sd"> >>> def exp(x) -> float:</span> |
| <span class="sd"> ... return np.exp(x)</span> |
| <span class="sd"> >>> s.transform([sqrt, exp])</span> |
| <span class="sd"> sqrt exp</span> |
| <span class="sd"> 0 0.000000 1.000000</span> |
| <span class="sd"> 1 1.000000 2.718282</span> |
| <span class="sd"> 2 1.414214 7.389056</span> |
| |
| <span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span> |
| |
| <span class="sd"> >>> s.transform([np.sqrt, np.exp])</span> |
| <span class="sd"> sqrt exp</span> |
| <span class="sd"> 0 0.000000 1.000000</span> |
| <span class="sd"> 1 1.000000 2.718282</span> |
| <span class="sd"> 2 1.414214 7.389056</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">func</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.round"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.round.html#pyspark.pandas.Series.round">[docs]</a> <span class="k">def</span> <span class="nf">round</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">decimals</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Round each value in a Series to the given number of decimals.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> decimals : int</span> |
| <span class="sd"> Number of decimal places to round to (default: 0).</span> |
| <span class="sd"> If decimals is negative, it specifies the number of</span> |
| <span class="sd"> positions to the left of the decimal point.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series object</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.round</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.Series([0.028208, 0.038683, 0.877076], name='x')</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> 0 0.028208</span> |
| <span class="sd"> 1 0.038683</span> |
| <span class="sd"> 2 0.877076</span> |
| <span class="sd"> Name: x, dtype: float64</span> |
| |
| <span class="sd"> >>> df.round(2)</span> |
| <span class="sd"> 0 0.03</span> |
| <span class="sd"> 1 0.04</span> |
| <span class="sd"> 2 0.88</span> |
| <span class="sd"> Name: x, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">decimals</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"decimals must be an integer"</span><span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">round</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">decimals</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span> |
| <span class="n">scol</span><span class="p">,</span> |
| <span class="n">field</span><span class="o">=</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">DecimalType</span><span class="p">)</span> |
| <span class="k">else</span> <span class="kc">None</span> |
| <span class="p">),</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: add 'interpolation' parameter.</span> |
| <div class="viewcode-block" id="Series.quantile"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.quantile.html#pyspark.pandas.Series.quantile">[docs]</a> <span class="k">def</span> <span class="nf">quantile</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">q</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return value at the given quantile.</span> |
| |
| <span class="sd"> .. note:: Unlike pandas, the quantile in pandas-on-Spark is an approximate quantile</span> |
| <span class="sd"> based upon approximate percentile computation because computing quantile across</span> |
| <span class="sd"> a large dataset is extremely expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> q : float or array-like, default 0.5 (50% quantile)</span> |
| <span class="sd"> 0 &lt;= q &lt;= 1, the quantile(s) to compute.</span> |
| <span class="sd"> accuracy : int, optional</span> |
| <span class="sd"> Default accuracy of approximation. Larger value means better accuracy.</span> |
| <span class="sd"> The relative error can be deduced by 1.0 / accuracy.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> float or Series</span> |
| <span class="sd"> If the current object is a Series and ``q`` is an array, a Series will be</span> |
| <span class="sd"> returned where the index is ``q`` and the values are the quantiles, otherwise</span> |
| <span class="sd"> a float will be returned.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, 4, 5])</span> |
| <span class="sd"> >>> s.quantile(.5)</span> |
| <span class="sd"> 3.0</span> |
| |
| <span class="sd"> >>> (s + 1).quantile(.5)</span> |
| <span class="sd"> 4.0</span> |
| |
| <span class="sd"> >>> s.quantile([.25, .5, .75])</span> |
| <span class="sd"> 0.25 2.0</span> |
| <span class="sd"> 0.50 3.0</span> |
| <span class="sd"> 0.75 4.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> (s + 1).quantile([.25, .5, .75])</span> |
| <span class="sd"> 0.25 3.0</span> |
| <span class="sd"> 0.50 4.0</span> |
| <span class="sd"> 0.75 5.0</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="n">cast</span><span class="p">(</span> |
| <span class="s2">"ps.DataFrame"</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">quantile</span><span class="p">(</span><span class="n">q</span><span class="o">=</span><span class="n">q</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">accuracy</span><span class="o">=</span><span class="n">accuracy</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="nb">float</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"q must be a float or an array of floats; however, [</span><span class="si">%s</span><span class="s2">] found."</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">q</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">q_float</span> <span class="o">=</span> <span class="n">q</span> |
| <span class="k">if</span> <span class="n">q_float</span> <span class="o">&lt;</span> <span class="mf">0.0</span> <span class="ow">or</span> <span class="n">q_float</span> <span class="o">&gt;</span> <span class="mf">1.0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"percentiles should all be in the interval [0, 1]."</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">quantile</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="p">(</span><span class="n">BooleanType</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)):</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">()),</span> <span class="n">q_float</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">quantile</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"quantile"</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: add axis, numeric_only, pct, na_option parameter</span> |
| <div class="viewcode-block" id="Series.rank"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.rank.html#pyspark.pandas.Series.rank">[docs]</a> <span class="k">def</span> <span class="nf">rank</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"average"</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute numerical data ranks (1 through n) along axis. Equal values are</span> |
| <span class="sd"> assigned a rank that is the average of the ranks of those values.</span> |
| |
| <span class="sd"> .. note:: the current implementation of rank uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This moves all data into a</span> |
| <span class="sd"> single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method on very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> method : {'average', 'min', 'max', 'first', 'dense'}</span> |
| <span class="sd"> * average: average rank of group</span> |
| <span class="sd"> * min: lowest rank in group</span> |
| <span class="sd"> * max: highest rank in group</span> |
| <span class="sd"> * first: ranks assigned in order they appear in the array</span> |
| <span class="sd"> * dense: like 'min', but rank always increases by 1 between groups</span> |
| <span class="sd"> ascending : boolean, default True</span> |
| <span class="sd"> False for ranks by high (1) to low (N)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> ranks : same type as caller</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 2, 3], name='A')</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> Name: A, dtype: int64</span> |
| |
| <span class="sd"> >>> s.rank()</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.5</span> |
| <span class="sd"> 2 2.5</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> Name: A, dtype: float64</span> |
| |
| <span class="sd"> If method is set to 'min', it uses the lowest rank in the group.</span> |
| |
| <span class="sd"> >>> s.rank(method='min')</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 2.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> Name: A, dtype: float64</span> |
| |
| <span class="sd"> If method is set to 'max', it uses the highest rank in the group.</span> |
| |
| <span class="sd"> >>> s.rank(method='max')</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 3.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> Name: A, dtype: float64</span> |
| |
| <span class="sd"> If method is set to 'first', ranks are assigned in the order the values appear, without grouping.</span> |
| |
| <span class="sd"> >>> s.rank(method='first')</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 4.0</span> |
| <span class="sd"> Name: A, dtype: float64</span> |
| |
| <span class="sd"> If method is set to 'dense', it leaves no gaps between groups.</span> |
| |
| <span class="sd"> >>> s.rank(method='dense')</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 2.0</span> |
| <span class="sd"> 3 3.0</span> |
| <span class="sd"> Name: A, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_rank</span><span class="p">(</span><span class="n">method</span><span class="p">,</span> <span class="n">ascending</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">analyzed</span></div> |
| |
| <span class="k">def</span> <span class="nf">_rank</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"average"</span><span class="p">,</span> |
| <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">part_cols</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">method</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"average"</span><span class="p">,</span> <span class="s2">"min"</span><span class="p">,</span> <span class="s2">"max"</span><span class="p">,</span> <span class="s2">"first"</span><span class="p">,</span> <span class="s2">"dense"</span><span class="p">]:</span> |
| <span class="n">msg</span> <span class="o">=</span> <span class="s2">"method must be one of 'average', 'min', 'max', 'first', 'dense'"</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"rank do not support MultiIndex now"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">ascending</span><span class="p">:</span> |
| <span class="n">asc_func</span> <span class="o">=</span> <span class="n">Column</span><span class="o">.</span><span class="n">asc</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">asc_func</span> <span class="o">=</span> <span class="n">Column</span><span class="o">.</span><span class="n">desc</span> |
| |
| <span class="k">if</span> <span class="n">method</span> <span class="o">==</span> <span class="s2">"first"</span><span class="p">:</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">asc_func</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span> |
| <span class="n">asc_func</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)),</span> |
| <span class="p">)</span> |
| <span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">method</span> <span class="o">==</span> <span class="s2">"dense"</span><span class="p">:</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">asc_func</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">dense_rank</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">method</span> <span class="o">==</span> <span class="s2">"average"</span><span class="p">:</span> |
| <span class="n">stat_func</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">mean</span> |
| <span class="k">elif</span> <span class="n">method</span> <span class="o">==</span> <span class="s2">"min"</span><span class="p">:</span> |
| <span class="n">stat_func</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span> |
| <span class="k">elif</span> <span class="n">method</span> <span class="o">==</span> <span class="s2">"max"</span><span class="p">:</span> |
| <span class="n">stat_func</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span> |
| <span class="n">window1</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">asc_func</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">window2</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span> |
| <span class="n">cast</span><span class="p">(</span><span class="s2">"List[ColumnOrName]"</span><span class="p">,</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">])</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">part_cols</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">unboundedFollowing</span><span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">stat_func</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window1</span><span class="p">))</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window2</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">()))</span> |
| |
| <div class="viewcode-block" id="Series.filter"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.filter.html#pyspark.pandas.Series.filter">[docs]</a> <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">items</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">like</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">regex</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Series does not support columns axis."</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">items</span><span class="o">=</span><span class="n">items</span><span class="p">,</span> <span class="n">like</span><span class="o">=</span><span class="n">like</span><span class="p">,</span> <span class="n">regex</span><span class="o">=</span><span class="n">regex</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">),</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <span class="nb">filter</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">filter</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="Series.describe"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.describe.html#pyspark.pandas.Series.describe">[docs]</a> <span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">percentiles</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">describe</span><span class="p">(</span><span class="n">percentiles</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <span class="n">describe</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">describe</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="Series.diff"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.diff.html#pyspark.pandas.Series.diff">[docs]</a> <span class="k">def</span> <span class="nf">diff</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> First discrete difference of element.</span> |
| |
| <span class="sd"> Calculates the difference of a Series element compared with another element in the</span> |
| <span class="sd"> Series (default is the element in the previous row).</span> |
| |
| <span class="sd"> .. note:: the current implementation of diff uses Spark's Window without</span> |
| <span class="sd"> specifying partition specification. This leads to moving all data into</span> |
| <span class="sd"> a single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method with very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : int, default 1</span> |
| <span class="sd"> Periods to shift for calculating difference, accepts negative values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> diffed : Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4, 5, 6],</span> |
| <span class="sd"> ... 'b': [1, 1, 2, 3, 5, 8],</span> |
| <span class="sd"> ... 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 1 1</span> |
| <span class="sd"> 1 2 1 4</span> |
| <span class="sd"> 2 3 2 9</span> |
| <span class="sd"> 3 4 3 16</span> |
| <span class="sd"> 4 5 5 25</span> |
| <span class="sd"> 5 6 8 36</span> |
| |
| <span class="sd"> >>> df.b.diff()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 0.0</span> |
| <span class="sd"> 2 1.0</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> 4 2.0</span> |
| <span class="sd"> 5 3.0</span> |
| <span class="sd"> Name: b, dtype: float64</span> |
| |
| <span class="sd"> Difference with 3rd previous value</span> |
| |
| <span class="sd"> >>> df.c.diff(periods=3)</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 NaN</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 3 15.0</span> |
| <span class="sd"> 4 21.0</span> |
| <span class="sd"> 5 27.0</span> |
| <span class="sd"> Name: c, dtype: float64</span> |
| |
| <span class="sd"> Difference with following value</span> |
| |
| <span class="sd"> >>> df.c.diff(periods=-1)</span> |
| <span class="sd"> 0 -3.0</span> |
| <span class="sd"> 1 -5.0</span> |
| <span class="sd"> 2 -7.0</span> |
| <span class="sd"> 3 -9.0</span> |
| <span class="sd"> 4 -11.0</span> |
| <span class="sd"> 5 NaN</span> |
| <span class="sd"> Name: c, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_diff</span><span class="p">(</span><span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">analyzed</span></div> |
| |
| <span class="k">def</span> <span class="nf">_diff</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">part_cols</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="p">())</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"periods should be an int; however, got [</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="n">periods</span><span class="p">,</span> <span class="o">-</span><span class="n">periods</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="o">-</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">field</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span> |
| |
| <div class="viewcode-block" id="Series.idxmax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.idxmax.html#pyspark.pandas.Series.idxmax">[docs]</a> <span class="k">def</span> <span class="nf">idxmax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Tuple</span><span class="p">,</span> <span class="n">Any</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the row label of the maximum value.</span> |
| |
| <span class="sd"> If multiple values equal the maximum, the first row label with that</span> |
| <span class="sd"> value is returned.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : bool, default True</span> |
| <span class="sd"> Exclude NA/null values. If the entire Series is NA, the result</span> |
| <span class="sd"> will be NA.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Index</span> |
| <span class="sd"> Label of the maximum value.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> ValueError</span> |
| <span class="sd"> If the Series is empty.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.idxmin : Return index *label* of the first occurrence</span> |
| <span class="sd"> of minimum of values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(data=[1, None, 4, 3, 5],</span> |
| <span class="sd"> ... index=['A', 'B', 'C', 'D', 'E'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> A 1.0</span> |
| <span class="sd"> B NaN</span> |
| <span class="sd"> C 4.0</span> |
| <span class="sd"> D 3.0</span> |
| <span class="sd"> E 5.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.idxmax()</span> |
| <span class="sd"> 'E'</span> |
| |
| <span class="sd"> If `skipna` is False and there is an NA value in the data,</span> |
| <span class="sd"> the function returns ``nan``.</span> |
| |
| <span class="sd"> >>> s.idxmax(skipna=False)</span> |
| <span class="sd"> nan</span> |
| |
| <span class="sd"> In case of multi-index, you get a tuple:</span> |
| |
| <span class="sd"> >>> index = pd.MultiIndex.from_arrays([</span> |
| <span class="sd"> ... ['a', 'a', 'b', 'b'], ['c', 'd', 'e', 'f']], names=('first', 'second'))</span> |
| <span class="sd"> >>> s = ps.Series(data=[1, None, 4, 5], index=index)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> first second</span> |
| <span class="sd"> a c 1.0</span> |
| <span class="sd"> d NaN</span> |
| <span class="sd"> b e 4.0</span> |
| <span class="sd"> f 5.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.idxmax()</span> |
| <span class="sd"> ('b', 'f')</span> |
| |
| <span class="sd"> If multiple values equal the maximum, the first row label with that</span> |
| <span class="sd"> value is returned.</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 100, 1, 100, 1, 100], index=[10, 3, 5, 2, 1, 8])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 10 1</span> |
| <span class="sd"> 3 100</span> |
| <span class="sd"> 5 1</span> |
| <span class="sd"> 2 100</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 8 100</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.idxmax()</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> """</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">index_scols</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| |
| <span class="k">if</span> <span class="n">skipna</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">(),</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">(),</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| |
| <span class="n">results</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="n">scol</span><span class="p">]</span> <span class="o">+</span> <span class="n">index_scols</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">results</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"attempt to get idxmin of an empty sequence"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">results</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="c1"># This happens when skipna is False (nulls are sorted first) or when</span> |
| <span class="c1"># every value is null.</span> |
| <span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">results</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">1</span><span class="p">:])</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">values</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.idxmin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.idxmin.html#pyspark.pandas.Series.idxmin">[docs]</a> <span class="k">def</span> <span class="nf">idxmin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Tuple</span><span class="p">,</span> <span class="n">Any</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the row label of the minimum value.</span> |
| |
| <span class="sd"> If multiple values equal the minimum, the first row label with that</span> |
| <span class="sd"> value is returned.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : bool, default True</span> |
| <span class="sd"> Exclude NA/null values. If the entire Series is NA, the result</span> |
| <span class="sd"> will be NA.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Index</span> |
| <span class="sd"> Label of the minimum value.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> ValueError</span> |
| <span class="sd"> If the Series is empty.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.idxmax : Return index *label* of the first occurrence</span> |
| <span class="sd"> of maximum of values.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method is the Series version of ``ndarray.argmin``. This method</span> |
| <span class="sd"> returns the label of the minimum, while ``ndarray.argmin`` returns</span> |
| <span class="sd"> the position. To get the position, use ``series.values.argmin()``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(data=[1, None, 4, 0],</span> |
| <span class="sd"> ... index=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> A 1.0</span> |
| <span class="sd"> B NaN</span> |
| <span class="sd"> C 4.0</span> |
| <span class="sd"> D 0.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.idxmin()</span> |
| <span class="sd"> 'D'</span> |
| |
| <span class="sd"> If `skipna` is False and there is an NA value in the data,</span> |
| <span class="sd"> the function returns ``nan``.</span> |
| |
| <span class="sd"> >>> s.idxmin(skipna=False)</span> |
| <span class="sd"> nan</span> |
| |
| <span class="sd"> In case of multi-index, you get a tuple:</span> |
| |
| <span class="sd"> >>> index = pd.MultiIndex.from_arrays([</span> |
| <span class="sd"> ... ['a', 'a', 'b', 'b'], ['c', 'd', 'e', 'f']], names=('first', 'second'))</span> |
| <span class="sd"> >>> s = ps.Series(data=[1, None, 4, 0], index=index)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> first second</span> |
| <span class="sd"> a c 1.0</span> |
| <span class="sd"> d NaN</span> |
| <span class="sd"> b e 4.0</span> |
| <span class="sd"> f 0.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.idxmin()</span> |
| <span class="sd"> ('b', 'f')</span> |
| |
| <span class="sd"> If multiple values equal the minimum, the first row label with that</span> |
| <span class="sd"> value is returned.</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 100, 1, 100, 1, 100], index=[10, 3, 5, 2, 1, 8])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 10 1</span> |
| <span class="sd"> 3 100</span> |
| <span class="sd"> 5 1</span> |
| <span class="sd"> 2 100</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 8 100</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.idxmin()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> """</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">index_scols</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| |
| <span class="k">if</span> <span class="n">skipna</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">(),</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">(),</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| |
| <span class="n">results</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="n">scol</span><span class="p">]</span> <span class="o">+</span> <span class="n">index_scols</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">results</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"attempt to get idxmin of an empty sequence"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">results</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="c1"># This happens when skipna is False (nulls are sorted first) or when</span> |
| <span class="c1"># every value is null.</span> |
| <span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">results</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">1</span><span class="p">:])</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">values</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.pop"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.pop.html#pyspark.pandas.Series.pop">[docs]</a> <span class="k">def</span> <span class="nf">pop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Name</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Scalar</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return item and drop from series.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> item : label</span> |
| <span class="sd"> Label of index to be popped.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Value that is popped from series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(data=np.arange(3), index=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> A 0</span> |
| <span class="sd"> B 1</span> |
| <span class="sd"> C 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.pop('A')</span> |
| <span class="sd"> 0</span> |
| |
| <span class="sd"> >>> s</span> |
| <span class="sd"> B 1</span> |
| <span class="sd"> C 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s = ps.Series(data=np.arange(3), index=['A', 'A', 'C'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> A 0</span> |
| <span class="sd"> A 1</span> |
| <span class="sd"> C 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.pop('A')</span> |
| <span class="sd"> A 0</span> |
| <span class="sd"> A 1</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s</span> |
| <span class="sd"> C 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Also supports MultiIndex</span> |
| |
| <span class="sd"> >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],</span> |
| <span class="sd"> ... ['speed', 'weight', 'length']],</span> |
| <span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span> |
| <span class="sd"> >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],</span> |
| <span class="sd"> ... index=midx)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> lama speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.pop('lama')</span> |
| <span class="sd"> speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Also supports MultiIndex with several levels.</span> |
| |
| <span class="sd"> >>> midx = pd.MultiIndex([['a', 'b', 'c'],</span> |
| <span class="sd"> ... ['lama', 'cow', 'falcon'],</span> |
| <span class="sd"> ... ['speed', 'weight', 'length']],</span> |
| <span class="sd"> ... [[0, 0, 0, 0, 0, 0, 1, 1, 1],</span> |
| <span class="sd"> ... [0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 0, 2]]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],</span> |
| <span class="sd"> ... index=midx)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> a lama speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> b falcon speed 320.0</span> |
| <span class="sd"> speed 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.pop(('a', 'lama'))</span> |
| <span class="sd"> speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s</span> |
| <span class="sd"> a cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> b falcon speed 320.0</span> |
| <span class="sd"> speed 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.pop(('b', 'falcon', 'speed'))</span> |
| <span class="sd"> (b, falcon, speed) 320.0</span> |
| <span class="sd"> (b, falcon, speed) 1.0</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"'key' should be string or tuple that contains strings"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="p">(</span><span class="n">item</span><span class="p">,)</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"Key length (</span><span class="si">{}</span><span class="s2">) exceeds index depth (</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span> |
| <span class="n">scols</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="p">:]</span> <span class="o">+</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">]</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="p">[</span><span class="n">internal</span><span class="o">.</span><span class="n">spark_columns</span><span class="p">[</span><span class="n">level</span><span class="p">]</span> <span class="o">==</span> <span class="n">index</span> <span class="k">for</span> <span class="n">level</span><span class="p">,</span> <span class="n">index</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">item</span><span class="p">)]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&</span> <span class="n">y</span><span class="p">,</span> <span class="n">rows</span><span class="p">))</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">scols</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_drop</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="c1"># if spark_frame has one column and one data, return data only without frame</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> |
| <span class="n">length</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">length</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">val</span> <span class="o">=</span> <span class="n">pdf</span><span class="p">[</span><span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">CategoricalDtype</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">categories</span><span class="p">[</span><span class="n">val</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">val</span> |
| |
| <span class="n">item_string</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">item_string</span><span class="p">)))</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">)],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="p">:],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="p">:],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="Series.copy"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.copy.html#pyspark.pandas.Series.copy">[docs]</a> <span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">deep</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Make a copy of this object's indices and data.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> deep : bool, default True</span> |
| <span class="sd"> This parameter is not supported; it is a dummy parameter kept only to match pandas.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> copy : Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2], index=["a", "b"])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> a 1</span> |
| <span class="sd"> b 2</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> >>> s_copy = s.copy()</span> |
| <span class="sd"> >>> s_copy</span> |
| <span class="sd"> a 1</span> |
| <span class="sd"> b 2</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="Series.mode"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.mode.html#pyspark.pandas.Series.mode">[docs]</a> <span class="k">def</span> <span class="nf">mode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the mode(s) of the dataset.</span> |
| |
| <span class="sd"> Always returns Series even if only one value is returned.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> dropna : bool, default True</span> |
| <span class="sd"> Don't consider counts of NaN/NaT.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Modes of the Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([0, 0, 1, 1, 1, np.nan, np.nan, np.nan])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 0.0</span> |
| <span class="sd"> 1 0.0</span> |
| <span class="sd"> 2 1.0</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> 4 1.0</span> |
| <span class="sd"> 5 NaN</span> |
| <span class="sd"> 6 NaN</span> |
| <span class="sd"> 7 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.mode()</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> If several values are tied for the mode, all of them are returned</span> |
| |
| <span class="sd"> >>> s = ps.Series([0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,</span> |
| <span class="sd"> ... np.nan, np.nan, np.nan])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 0.0</span> |
| <span class="sd"> 1 0.0</span> |
| <span class="sd"> 2 1.0</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> 4 1.0</span> |
| <span class="sd"> 5 2.0</span> |
| <span class="sd"> 6 2.0</span> |
| <span class="sd"> 7 2.0</span> |
| <span class="sd"> 8 3.0</span> |
| <span class="sd"> 9 3.0</span> |
| <span class="sd"> 10 3.0</span> |
| <span class="sd"> 11 NaN</span> |
| <span class="sd"> 12 NaN</span> |
| <span class="sd"> 13 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.mode().sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS</span> |
| <span class="sd"> &lt;BLANKLINE&gt;</span> |
| <span class="sd"> ... 1.0</span> |
| <span class="sd"> ... 2.0</span> |
| <span class="sd"> ... 3.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> With 'dropna' set to 'False', we can also see NaN in the result</span> |
| |
| <span class="sd"> >>> s.mode(False).sort_values() # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS</span> |
| <span class="sd"> &lt;BLANKLINE&gt;</span> |
| <span class="sd"> ... 1.0</span> |
| <span class="sd"> ... 2.0</span> |
| <span class="sd"> ... 3.0</span> |
| <span class="sd"> ... NaN</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="n">ser_count</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">value_counts</span><span class="p">(</span><span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> <span class="n">sort</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="n">sdf_count</span> <span class="o">=</span> <span class="n">ser_count</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">most_value</span> <span class="o">=</span> <span class="n">ser_count</span><span class="o">.</span><span class="n">max</span><span class="p">()</span> |
| <span class="n">sdf_most_value</span> <span class="o">=</span> <span class="n">sdf_count</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="s2">"count == </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">most_value</span><span class="p">))</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf_most_value</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span><span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">])</span> |
| |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="Series.keys"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.keys.html#pyspark.pandas.Series.keys">[docs]</a> <span class="k">def</span> <span class="nf">keys</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ps.Index"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return alias for index.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Index</span> |
| <span class="sd"> Index of the Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],</span> |
| <span class="sd"> ... ['speed', 'weight', 'length']],</span> |
| <span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span> |
| <span class="sd"> >>> psser = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)</span> |
| |
| <span class="sd"> >>> psser.keys() # doctest: +SKIP</span> |
| <span class="sd"> MultiIndex([( 'lama', 'speed'),</span> |
| <span class="sd"> ( 'lama', 'weight'),</span> |
| <span class="sd"> ( 'lama', 'length'),</span> |
| <span class="sd"> ( 'cow', 'speed'),</span> |
| <span class="sd"> ( 'cow', 'weight'),</span> |
| <span class="sd"> ( 'cow', 'length'),</span> |
| <span class="sd"> ('falcon', 'speed'),</span> |
| <span class="sd"> ('falcon', 'weight'),</span> |
| <span class="sd"> ('falcon', 'length')],</span> |
| <span class="sd"> )</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span></div> |
| |
| <span class="c1"># TODO: 'regex', 'method' parameter</span> |
| <div class="viewcode-block" id="Series.replace"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.replace.html#pyspark.pandas.Series.replace">[docs]</a> <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Dict</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">regex</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Replace values given in to_replace with value.</span> |
| <span class="sd"> Values of the Series are replaced with other values dynamically.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> to_replace : str, list, tuple, dict, Series, int, float, or None</span> |
| <span class="sd"> How to find the values that will be replaced.</span> |
| <span class="sd"> * numeric, str:</span> |
| |
| <span class="sd"> - numeric: numeric values equal to to_replace will be replaced with value</span> |
| <span class="sd"> - str: string exactly matching to_replace will be replaced with value</span> |
| |
| <span class="sd"> * list of str or numeric:</span> |
| |
| <span class="sd"> - if to_replace and value are both lists or tuples, they must be the same length.</span> |
| <span class="sd"> - str and numeric rules apply as above.</span> |
| |
| <span class="sd"> * dict:</span> |
| |
| <span class="sd"> - Dicts can be used to specify different replacement values for different</span> |
| <span class="sd"> existing values.</span> |
| <span class="sd"> For example, {'a': 'b', 'y': 'z'} replaces the value ‘a’ with ‘b’ and ‘y’</span> |
| <span class="sd"> with ‘z’. To use a dict in this way the value parameter should be None.</span> |
| <span class="sd"> - For a DataFrame a dict can specify that different values should be replaced</span> |
| <span class="sd"> in different columns. For example, {'a': 1, 'b': 'z'} looks for the value 1</span> |
| <span class="sd"> in column ‘a’ and the value ‘z’ in column ‘b’ and replaces these values with</span> |
| <span class="sd"> whatever is specified in value.</span> |
| <span class="sd"> The value parameter should not be None in this case.</span> |
| <span class="sd"> You can treat this as a special case of passing two lists except that you are</span> |
| <span class="sd"> specifying the column to search in.</span> |
| |
| <span class="sd"> See the examples section for examples of each of these.</span> |
| |
| <span class="sd"> value : scalar, dict, list, tuple, str, default None</span> |
| <span class="sd"> Value to replace any values matching to_replace with.</span> |
| <span class="sd"> For a DataFrame a dict of values can be used to specify which value to use</span> |
| <span class="sd"> for each column (columns not in the dict will not be filled).</span> |
| <span class="sd"> Regular expressions, strings and lists or dicts of such objects are also allowed.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Object after replacement.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> Scalar `to_replace` and `value`</span> |
| |
| <span class="sd"> >>> s = ps.Series([0, 1, 2, 3, 4])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.replace(0, 5)</span> |
| <span class="sd"> 0 5</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> List-like `to_replace`</span> |
| |
| <span class="sd"> >>> s.replace([0, 4], 5000)</span> |
| <span class="sd"> 0 5000</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 5000</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.replace([1, 2, 3], [10, 20, 30])</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 10</span> |
| <span class="sd"> 2 20</span> |
| <span class="sd"> 3 30</span> |
| <span class="sd"> 4 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Dict-like `to_replace`</span> |
| |
| <span class="sd"> >>> s.replace({1: 1000, 2: 2000, 3: 3000, 4: 4000})</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1000</span> |
| <span class="sd"> 2 2000</span> |
| <span class="sd"> 3 3000</span> |
| <span class="sd"> 4 4000</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Also supports MultiIndex</span> |
| |
| <span class="sd"> >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],</span> |
| <span class="sd"> ... ['speed', 'weight', 'length']],</span> |
| <span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span> |
| <span class="sd"> >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],</span> |
| <span class="sd"> ... index=midx)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> lama speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.replace(45, 450)</span> |
| <span class="sd"> lama speed 450.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.replace([45, 30, 320], 500)</span> |
| <span class="sd"> lama speed 500.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 500.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 500.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.replace({45: 450, 30: 300})</span> |
| <span class="sd"> lama speed 450.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> cow speed 300.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">to_replace</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">"ffill"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">,</span> <span class="nb">dict</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"'to_replace' should be one of str, list, tuple, dict, int, float"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">regex</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"replace currently not support for regex"</span><span class="p">)</span> |
| <span class="n">to_replace</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">to_replace</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="k">else</span> <span class="n">to_replace</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">len</span><span class="p">(</span><span class="n">to_replace</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Replacement lists must match in length. Expecting </span><span class="si">{}</span><span class="s2"> got </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="nb">len</span><span class="p">(</span><span class="n">to_replace</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">to_replace</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">v</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="n">value</span><span class="p">)}</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="n">is_start</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">to_replace</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">to_replace_</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">to_replace</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">isnan</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> <span class="o">|</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isNull</span><span class="p">())</span> |
| <span class="k">if</span> <span class="n">pd</span><span class="o">.</span><span class="n">isna</span><span class="p">(</span><span class="n">to_replace_</span><span class="p">)</span> |
| <span class="k">else</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">to_replace_</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">is_start</span><span class="p">:</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> |
| <span class="n">is_start</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">current</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span><span class="n">to_replace</span><span class="p">)</span> |
| <span class="c1"># to_replace may be a scalar</span> |
| <span class="k">if</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">isna</span><span class="p">(</span><span class="n">to_replace</span><span class="p">))</span><span class="o">.</span><span class="n">any</span><span class="p">():</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span> <span class="o">|</span> <span class="n">F</span><span class="o">.</span><span class="n">isnan</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> <span class="o">|</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isNull</span><span class="p">()</span> |
| <span class="n">current</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">current</span><span class="p">)</span> <span class="c1"># TODO: dtype?</span></div> |
| |
| <div class="viewcode-block" id="Series.update"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.update.html#pyspark.pandas.Series.update">[docs]</a> <span class="k">def</span> <span class="nf">update</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Modify Series in place using non-NA values from passed Series. Aligns on index.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3])</span> |
| <span class="sd"> >>> s.update(ps.Series([4, 5, 6]))</span> |
| <span class="sd"> >>> s.sort_index()</span> |
| <span class="sd"> 0 4</span> |
| <span class="sd"> 1 5</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s = ps.Series(['a', 'b', 'c'])</span> |
| <span class="sd"> >>> s.update(ps.Series(['d', 'e'], index=[0, 2]))</span> |
| <span class="sd"> >>> s.sort_index()</span> |
| <span class="sd"> 0 d</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 e</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 2, 3])</span> |
| <span class="sd"> >>> s.update(ps.Series([4, 5, 6, 7, 8]))</span> |
| <span class="sd"> >>> s.sort_index()</span> |
| <span class="sd"> 0 4</span> |
| <span class="sd"> 1 5</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 2, 3], index=[10, 11, 12])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 10 1</span> |
| <span class="sd"> 11 2</span> |
| <span class="sd"> 12 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.update(ps.Series([4, 5, 6]))</span> |
| <span class="sd"> >>> s.sort_index()</span> |
| <span class="sd"> 10 1</span> |
| <span class="sd"> 11 2</span> |
| <span class="sd"> 12 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.update(ps.Series([4, 5, 6], index=[11, 12, 13]))</span> |
| <span class="sd"> >>> s.sort_index()</span> |
| <span class="sd"> 10 1</span> |
| <span class="sd"> 11 4</span> |
| <span class="sd"> 12 5</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> If ``other`` contains NaNs the corresponding values are not updated</span> |
| <span class="sd"> in the original Series.</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 2, 3])</span> |
| <span class="sd"> >>> s.update(ps.Series([4, np.nan, 6]))</span> |
| <span class="sd"> >>> s.sort_index()</span> |
| <span class="sd"> 0 4.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 6.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"'other' must be a Series"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">other</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_spark_column</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">,</span> <span class="n">scol</span> <span class="c1"># TODO: dtype?</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">other</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">"leftouter"</span><span class="p">)</span> |
| |
| <span class="n">this_scol</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">"this"</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| <span class="n">that_scol</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">"that"</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">that_scol</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">that_scol</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">this_scol</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">"this"</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_spark_column</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">,</span> <span class="n">scol</span> <span class="c1"># TODO: dtype?</span> |
| <span class="p">)</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="p">,</span> <span class="n">requires_same_anchor</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.where"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.where.html#pyspark.pandas.Series.where">[docs]</a> <span class="k">def</span> <span class="nf">where</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cond</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Replace values where the condition is False.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cond : boolean Series</span> |
| <span class="sd"> Where cond is True, keep the original value. Where False,</span> |
| <span class="sd"> replace with corresponding value from other.</span> |
| <span class="sd"> other : scalar, Series</span> |
| <span class="sd"> Entries where cond is False are replaced with corresponding value from other.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> s1 = ps.Series([0, 1, 2, 3, 4])</span> |
| <span class="sd"> >>> s2 = ps.Series([100, 200, 300, 400, 500])</span> |
| <span class="sd"> >>> s1.where(s1 > 0).sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 2.0</span> |
| <span class="sd"> 3 3.0</span> |
| <span class="sd"> 4 4.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s1.where(s1 > 1, 10).sort_index()</span> |
| <span class="sd"> 0 10</span> |
| <span class="sd"> 1 10</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s1.where(s1 > 1, s1 + 100).sort_index()</span> |
| <span class="sd"> 0 100</span> |
| <span class="sd"> 1 101</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s1.where(s1 > 1, s2).sort_index()</span> |
| <span class="sd"> 0 100</span> |
| <span class="sd"> 1 200</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> |
| |
| <span class="c1"># We should check the DataFrame from both `cond` and `other`.</span> |
| <span class="n">should_try_ops_on_diff_frame</span> <span class="o">=</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> <span class="ow">or</span> <span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_try_ops_on_diff_frame</span><span class="p">:</span> |
| <span class="c1"># Try to perform it with the 'compute.ops_on_diff_frames' option.</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">tmp_cond_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__tmp_cond_col__"</span><span class="p">)</span> |
| <span class="n">tmp_other_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__tmp_other_col__"</span><span class="p">)</span> |
| |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_cond_col</span><span class="p">]</span> <span class="o">=</span> <span class="n">cond</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_other_col</span><span class="p">]</span> <span class="o">=</span> <span class="n">other</span> |
| |
| <span class="c1"># The above logic produces a Spark DataFrame that looks like the one below:</span> |
| <span class="c1"># +-----------------+---+----------------+-----------------+</span> |
| <span class="c1"># |__index_level_0__| 0|__tmp_cond_col__|__tmp_other_col__|</span> |
| <span class="c1"># +-----------------+---+----------------+-----------------+</span> |
| <span class="c1"># | 0| 0| false| 100|</span> |
| <span class="c1"># | 1| 1| false| 200|</span> |
| <span class="c1"># | 3| 3| true| 400|</span> |
| <span class="c1"># | 2| 2| true| 300|</span> |
| <span class="c1"># | 4| 4| true| 500|</span> |
| <span class="c1"># +-----------------+---+----------------+-----------------+</span> |
| <span class="n">condition</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_cond_col</span><span class="p">]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">tmp_other_col</span><span class="p">]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">condition</span><span class="p">],</span> <span class="n">column_labels</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">other</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">condition</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">cond</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.mask"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.mask.html#pyspark.pandas.Series.mask">[docs]</a> <span class="k">def</span> <span class="nf">mask</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cond</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Replace values where the condition is True.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cond : boolean Series</span> |
| <span class="sd"> Where cond is False, keep the original value. Where True,</span> |
| <span class="sd"> replace with corresponding value from other.</span> |
| <span class="sd"> other : scalar, Series</span> |
| <span class="sd"> Entries where cond is True are replaced with corresponding value from other.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> s1 = ps.Series([0, 1, 2, 3, 4])</span> |
| <span class="sd"> >>> s2 = ps.Series([100, 200, 300, 400, 500])</span> |
| <span class="sd"> >>> s1.mask(s1 > 0).sort_index()</span> |
| <span class="sd"> 0 0.0</span> |
| <span class="sd"> 1 NaN</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> 4 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s1.mask(s1 > 1, 10).sort_index()</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 10</span> |
| <span class="sd"> 3 10</span> |
| <span class="sd"> 4 10</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s1.mask(s1 > 1, s1 + 100).sort_index()</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 102</span> |
| <span class="sd"> 3 103</span> |
| <span class="sd"> 4 104</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s1.mask(s1 > 1, s2).sort_index()</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 300</span> |
| <span class="sd"> 3 400</span> |
| <span class="sd"> 4 500</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="o">~</span><span class="n">cond</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.xs"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.xs.html#pyspark.pandas.Series.xs">[docs]</a> <span class="k">def</span> <span class="nf">xs</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Name</span><span class="p">,</span> <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return cross-section from the Series.</span> |
| |
| <span class="sd"> This method takes a `key` argument to select data at a particular</span> |
| <span class="sd"> level of a MultiIndex.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> key : label or tuple of label</span> |
| <span class="sd"> Label contained in the index, or partially in a MultiIndex.</span> |
| <span class="sd"> level : object, defaults to first n levels (n=1 or len(key))</span> |
| <span class="sd"> In case of a key partially contained in a MultiIndex, indicate</span> |
| <span class="sd"> which levels are used. Levels can be referred by label or position.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Cross-section from the original Series</span> |
| <span class="sd"> corresponding to the selected index levels.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> midx = pd.MultiIndex([['a', 'b', 'c'],</span> |
| <span class="sd"> ... ['lama', 'cow', 'falcon'],</span> |
| <span class="sd"> ... ['speed', 'weight', 'length']],</span> |
| <span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 0, 0, 1, 1, 1, 2, 2, 2],</span> |
| <span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span> |
| <span class="sd"> >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],</span> |
| <span class="sd"> ... index=midx)</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> a lama speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> b cow speed 30.0</span> |
| <span class="sd"> weight 250.0</span> |
| <span class="sd"> length 1.5</span> |
| <span class="sd"> c falcon speed 320.0</span> |
| <span class="sd"> weight 1.0</span> |
| <span class="sd"> length 0.3</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Get values at specified index</span> |
| |
| <span class="sd"> >>> s.xs('a')</span> |
| <span class="sd"> lama speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Get values at several indexes</span> |
| |
| <span class="sd"> >>> s.xs(('a', 'lama'))</span> |
| <span class="sd"> speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Get values at specified index and level</span> |
| |
| <span class="sd"> >>> s.xs('lama', level=1)</span> |
| <span class="sd"> a speed 45.0</span> |
| <span class="sd"> weight 200.0</span> |
| <span class="sd"> length 1.2</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">key</span> <span class="o">=</span> <span class="p">(</span><span class="n">key</span><span class="p">,)</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">level</span> <span class="o">=</span> <span class="mi">0</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span> |
| <span class="n">scols</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[:</span><span class="n">level</span><span class="p">]</span> |
| <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">level</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="o">+</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">]</span> |
| <span class="p">)</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="p">[</span><span class="n">internal</span><span class="o">.</span><span class="n">spark_columns</span><span class="p">[</span><span class="n">lvl</span><span class="p">]</span> <span class="o">==</span> <span class="n">index</span> <span class="k">for</span> <span class="n">lvl</span><span class="p">,</span> <span class="n">index</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">level</span><span class="p">)]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&</span> <span class="n">y</span><span class="p">,</span> <span class="n">rows</span><span class="p">))</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">scols</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">):</span> |
| <span class="c1"># if the key covers every index level and exactly one row matches, return the scalar value instead of a Series</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> |
| <span class="n">length</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">length</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[:</span><span class="n">level</span><span class="p">]</span> |
| <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="n">level</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="p">)</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">[:</span><span class="n">level</span><span class="p">]</span> <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">[</span><span class="n">level</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">[:</span><span class="n">level</span><span class="p">]</span> <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">[</span><span class="n">level</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="p">:]</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="Series.pct_change"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.pct_change.html#pyspark.pandas.Series.pct_change">[docs]</a> <span class="k">def</span> <span class="nf">pct_change</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Percentage change between the current and a prior element.</span> |
| |
| <span class="sd"> .. note:: the current implementation of this API uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This moves all data into</span> |
| <span class="sd"> a single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method on very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : int, default 1</span> |
| <span class="sd"> Periods to shift for forming percent change.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> psser = ps.Series([90, 91, 85], index=[2, 4, 1])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 2 90</span> |
| <span class="sd"> 4 91</span> |
| <span class="sd"> 1 85</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> psser.pct_change()</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 4 0.011111</span> |
| <span class="sd"> 1 -0.065934</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> psser.sort_index().pct_change()</span> |
| <span class="sd"> 1 NaN</span> |
| <span class="sd"> 2 0.058824</span> |
| <span class="sd"> 4 0.011111</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> psser.pct_change(periods=2)</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 4 NaN</span> |
| <span class="sd"> 1 -0.055556</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="n">periods</span><span class="p">,</span> <span class="o">-</span><span class="n">periods</span><span class="p">)</span> |
| <span class="n">prev_row</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">((</span><span class="n">scol</span> <span class="o">-</span> <span class="n">prev_row</span><span class="p">)</span> <span class="o">/</span> <span class="n">prev_row</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">analyzed</span></div> |
| |
| <div class="viewcode-block" id="Series.combine_first"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.combine_first.html#pyspark.pandas.Series.combine_first">[docs]</a> <span class="k">def</span> <span class="nf">combine_first</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Combine Series values, choosing the calling Series's values first.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series</span> |
| <span class="sd"> The value(s) to be combined with the `Series`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> The result of combining the Series with the other object.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.combine : Perform elementwise operation on two Series</span> |
| <span class="sd"> using a given function.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Result index will be the union of the two indexes.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s1 = ps.Series([1, np.nan])</span> |
| <span class="sd"> >>> s2 = ps.Series([3, 4])</span> |
| <span class="sd"> >>> with ps.option_context("compute.ops_on_diff_frames", True):</span> |
| <span class="sd"> ... s1.combine_first(s2)</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 4.0</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"`combine_first` only allows `Series` for parameter `other`"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="n">this</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">that</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">combined</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">other</span><span class="o">.</span><span class="n">_psdf</span><span class="p">)</span> |
| <span class="n">this</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">"this"</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| <span class="n">that</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">"that"</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| <span class="c1"># If `self` has missing value, use value of `other`</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">this</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">that</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">this</span><span class="p">)</span> |
| <span class="c1"># If `self` and `other` come from same frame, the anchor should be kept</span> |
| <span class="k">if</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> <span class="c1"># TODO: dtype?</span> |
| <span class="n">index_scols</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="n">index_scols</span><span class="p">,</span> <span class="n">cond</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">sdf</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">]</span> <span class="c1"># TODO: dtype?</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="Series.dot"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.dot.html#pyspark.pandas.Series.dot">[docs]</a> <span class="k">def</span> <span class="nf">dot</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">])</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute the dot product between the Series and the columns of other.</span> |
| |
| <span class="sd"> This method computes the dot product between the Series and another</span> |
| <span class="sd"> one, or the Series and each column of a DataFrame.</span> |
| |
| <span class="sd"> It can also be called using `self @ other` in Python >= 3.5.</span> |
| |
| <span class="sd"> .. note:: This API is slightly different from pandas when indexes from both Series</span> |
| <span class="sd"> are not aligned and config 'compute.eager_check' is False. pandas raises an exception;</span> |
| <span class="sd"> however, pandas-on-Spark proceeds permissively and performs the computation, ignoring</span> |
| <span class="sd"> mismatches with NaN.</span> |
| |
| <span class="sd"> >>> pdf1 = pd.Series([1, 2, 3], index=[0, 1, 2])</span> |
| <span class="sd"> >>> pdf2 = pd.Series([1, 2, 3], index=[0, 1, 3])</span> |
| <span class="sd"> >>> pdf1.dot(pdf2) # doctest: +SKIP</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: matrices are not aligned</span> |
| |
| <span class="sd"> >>> psdf1 = ps.Series([1, 2, 3], index=[0, 1, 2])</span> |
| <span class="sd"> >>> psdf2 = ps.Series([1, 2, 3], index=[0, 1, 3])</span> |
| <span class="sd"> >>> with ps.option_context("compute.eager_check", False):</span> |
| <span class="sd"> ... psdf1.dot(psdf2) # doctest: +SKIP</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> 5</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series or DataFrame</span> |
| <span class="sd"> The other object to compute the dot product with its columns.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> scalar, Series</span> |
| <span class="sd"> Return the dot product of the Series and other if other is a</span> |
| <span class="sd"> Series, or the Series of the dot products of the Series and each column of</span> |
| <span class="sd"> other if other is a DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The Series and other have to share the same index if other is a Series</span> |
| <span class="sd"> or a DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([0, 1, 2, 3])</span> |
| |
| <span class="sd"> >>> s.dot(s)</span> |
| <span class="sd"> 14</span> |
| |
| <span class="sd"> >>> s @ s</span> |
| <span class="sd"> 14</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame({'x': [0, 1, 2, 3], 'y': [0, -1, -2, -3]})</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> x y</span> |
| <span class="sd"> 0 0 0</span> |
| <span class="sd"> 1 1 -1</span> |
| <span class="sd"> 2 2 -2</span> |
| <span class="sd"> 3 3 -3</span> |
| |
| <span class="sd"> >>> with ps.option_context("compute.ops_on_diff_frames", True):</span> |
| <span class="sd"> ... s.dot(psdf)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> x 14</span> |
| <span class="sd"> y -14</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.eager_check"</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">sort_values</span><span class="p">()</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span> |
| <span class="n">other</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">sort_values</span><span class="p">()</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"matrices are not aligned"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">index</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"matrices are not aligned"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">other_copy</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">other_copy</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| |
| <span class="n">self_column_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">other_copy</span><span class="p">,</span> <span class="s2">"__self_column__"</span><span class="p">)</span> |
| <span class="n">other_copy</span><span class="p">[</span><span class="n">self_column_label</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="n">self_psser</span> <span class="o">=</span> <span class="n">other_copy</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">self_column_label</span><span class="p">)</span> |
| |
| <span class="n">product_pssers</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">cast</span><span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">other_copy</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">*</span> <span class="n">self_psser</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="p">]</span> |
| |
| <span class="n">dot_product_psser</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">other_copy</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">product_pssers</span><span class="p">,</span> <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">dot_product_psser</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> |
| <span class="k">return</span> <span class="p">(</span><span class="bp">self</span> <span class="o">*</span> <span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span></div> |
| |
| <span class="k">def</span> <span class="fm">__matmul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">])</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Matrix multiplication using binary `@` operator in Python>=3.5.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="Series.repeat"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.repeat.html#pyspark.pandas.Series.repeat">[docs]</a> <span class="k">def</span> <span class="nf">repeat</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">repeats</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Repeat elements of a Series.</span> |
| |
| <span class="sd"> Returns a new Series where each element of the current Series</span> |
| <span class="sd"> is repeated consecutively a given number of times.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> repeats : int or Series</span> |
| <span class="sd"> The number of repetitions for each element. This should be a</span> |
| <span class="sd"> non-negative integer. Repeating 0 times will return an empty</span> |
| <span class="sd"> Series.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Newly created Series with repeated elements.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Index.repeat : Equivalent function for Index.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(['a', 'b', 'c'])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 c</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> >>> s.repeat(2)</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 c</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 c</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> >>> ps.Series([1, 2, 3]).repeat(0)</span> |
| <span class="sd"> Series([], dtype: int64)</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">repeats</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"`repeats` argument must be integer or Series, but got </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">repeats</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">repeats</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">repeats</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">temp_repeats</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__temp_repeats__"</span><span class="p">)</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">temp_repeats</span><span class="p">]</span> <span class="o">=</span> <span class="n">repeats</span> |
| <span class="k">return</span> <span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="o">.</span><span class="n">repeat</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">temp_repeats</span><span class="p">])</span> |
| <span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">array_repeat</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">repeats</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s2">"int32"</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="p">[</span><span class="n">scol</span><span class="p">])</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name_like_string</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">))],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">repeats</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"negative dimensions are not allowed"</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[[</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">]]</span> |
| <span class="k">if</span> <span class="n">repeats</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="s2">"ps.DataFrame"</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">psdf</span><span class="p">]</span> <span class="o">*</span> <span class="n">repeats</span><span class="p">)))</span></div> |
| |
| <div class="viewcode-block" id="Series.asof"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.asof.html#pyspark.pandas.Series.asof">[docs]</a> <span class="k">def</span> <span class="nf">asof</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">where</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">List</span><span class="p">])</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the last row(s) without any NaNs before `where`.</span> |
| |
| <span class="sd"> The last row (for each element in `where`, if list) without any</span> |
| <span class="sd"> NaN is taken.</span> |
| |
| <span class="sd"> If there is no good value, NaN is returned.</span> |
| |
| <span class="sd"> .. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`</span> |
| <span class="sd"> which is expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> where : index or array-like of indices</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> scalar or Series</span> |
| |
| <span class="sd"> The return can be:</span> |
| |
| <span class="sd"> * scalar : when `self` is a Series and `where` is a scalar</span> |
| <span class="sd"> * Series: when `self` is a Series and `where` is an array-like</span> |
| |
| <span class="sd"> Return scalar or Series</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Indices are assumed to be sorted. Raises if this is not the case and config</span> |
| <span class="sd"> 'compute.eager_check' is True. If 'compute.eager_check' is False pandas-on-Spark just</span> |
| <span class="sd"> proceeds and performs the operation, ignoring the order of the indices.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 10 1.0</span> |
| <span class="sd"> 20 2.0</span> |
| <span class="sd"> 30 NaN</span> |
| <span class="sd"> 40 4.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> A scalar `where`.</span> |
| |
| <span class="sd"> >>> s.asof(20)</span> |
| <span class="sd"> 2.0</span> |
| |
| <span class="sd"> For a sequence `where`, a Series is returned. The first value is</span> |
| <span class="sd"> NaN, because the first element of `where` is before the first</span> |
| <span class="sd"> index value.</span> |
| |
| <span class="sd"> >>> s.asof([5, 20]).sort_index()</span> |
| <span class="sd"> 5 NaN</span> |
| <span class="sd"> 20 2.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Missing values are not considered. The following is ``2.0``, not</span> |
| <span class="sd"> NaN, even though NaN is at the index location for ``30``.</span> |
| |
| <span class="sd"> >>> s.asof(30)</span> |
| <span class="sd"> 2.0</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 2, np.nan, 4], index=[10, 30, 20, 40])</span> |
| <span class="sd"> >>> with ps.option_context("compute.eager_check", False):</span> |
| <span class="sd"> ... s.asof(20)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> 1.0</span> |
| <span class="sd"> """</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"asof is not supported for a MultiIndex"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">where</span><span class="p">,</span> <span class="p">(</span><span class="n">ps</span><span class="o">.</span><span class="n">Index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"where cannot be an Index, Series or a DataFrame"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.eager_check"</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">is_monotonic_increasing</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"asof requires a sorted index"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">where</span><span class="p">):</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="n">where</span> <span class="o">=</span> <span class="p">[</span><span class="n">where</span><span class="p">]</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="n">index_scol</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">index_type</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">index_scol</span><span class="p">)</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">monotonically_increasing_id_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">,</span> <span class="s2">"__monotonically_increasing_id__"</span> |
| <span class="p">)</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">max_by</span><span class="p">(</span> |
| <span class="n">spark_column</span><span class="p">,</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="p">(</span><span class="n">index_scol</span> <span class="o">&lt;=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">index</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">index_type</span><span class="p">))</span> <span class="o">&amp;</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">pd</span><span class="o">.</span><span class="n">notna</span><span class="p">(</span><span class="n">index</span><span class="p">)</span> |
| <span class="c1"># If index is nan and the value of the col is not null</span> |
| <span class="c1"># then return monotonically_increasing_id. This will let max_by</span> |
| <span class="c1"># return the last index value, which is the behaviour of pandas.</span> |
| <span class="k">else</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> |
| <span class="n">monotonically_increasing_id_column</span><span class="p">,</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="n">where</span> |
| <span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">monotonically_increasing_id_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">monotonically_increasing_id</span><span class="p">()</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">should_return_series</span><span class="p">:</span> |
| <span class="k">with</span> <span class="n">sql_conf</span><span class="p">({</span><span class="n">SPARK_CONF_ARROW_ENABLED</span><span class="p">:</span> <span class="kc">False</span><span class="p">}):</span> |
| <span class="c1"># Disable Arrow to keep row ordering.</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">result</span> <span class="k">if</span> <span class="n">result</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span> |
| |
| <span class="c1"># The data is expected to be small so it's fine to transpose/use default index.</span> |
| <span class="k">with</span> <span class="n">ps</span><span class="o">.</span><span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.default_index_type"</span><span class="p">,</span> <span class="s2">"distributed"</span><span class="p">,</span> <span class="s2">"compute.max_rows"</span><span class="p">,</span> <span class="mi">1</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">where</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">where</span><span class="p">))</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_type</span><span class="p">,</span> <span class="n">TimestampType</span><span class="p">):</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">(</span><span class="n">where</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">transpose</span><span class="p">())</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># If `where` has duplicate items, fall back to pandas directly,</span> |
| <span class="c1"># since pandas API on Spark doesn't support duplicate column names.</span> |
| <span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> |
| <span class="n">pdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">(</span><span class="n">where</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">transpose</span><span class="p">()))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.mad"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.mad.html#pyspark.pandas.Series.mad">[docs]</a> <span class="k">def</span> <span class="nf">mad</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the mean absolute deviation of values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, 4])</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.mad()</span> |
| <span class="sd"> 1.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">avg</span> <span class="o">=</span> <span class="n">unpack_scalar</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">avg</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)))</span> |
| <span class="n">mad</span> <span class="o">=</span> <span class="n">unpack_scalar</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">avg</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">spark_column</span> <span class="o">-</span> <span class="n">avg</span><span class="p">))))</span> |
| |
| <span class="k">return</span> <span class="n">mad</span></div> |
| |
| <div class="viewcode-block" id="Series.unstack"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.unstack.html#pyspark.pandas.Series.unstack">[docs]</a> <span class="k">def</span> <span class="nf">unstack</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">level</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.</span> |
| <span class="sd"> The level involved will automatically get sorted.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Unlike pandas, pandas-on-Spark doesn't check whether an index is duplicated or not</span> |
| <span class="sd"> because checking for a duplicated index requires scanning the whole data, which</span> |
| <span class="sd"> can be quite expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> level : int, str, or list of these, default last level</span> |
| <span class="sd"> Level(s) to unstack, can pass level name.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Unstacked Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series([1, 2, 3, 4],</span> |
| <span class="sd"> ... index=pd.MultiIndex.from_product([['one', 'two'],</span> |
| <span class="sd"> ... ['a', 'b']]))</span> |
| <span class="sd"> >>> s</span> |
| <span class="sd"> one a 1</span> |
| <span class="sd"> b 2</span> |
| <span class="sd"> two a 3</span> |
| <span class="sd"> b 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> s.unstack(level=-1).sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> one 1 2</span> |
| <span class="sd"> two 3 4</span> |
| |
| <span class="sd"> >>> s.unstack(level=0).sort_index()</span> |
| <span class="sd"> one two</span> |
| <span class="sd"> a 1 3</span> |
| <span class="sd"> b 2 4</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Series.unstack only support for a MultiIndex"</span><span class="p">)</span> |
| <span class="n">index_nlevels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">nlevels</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="o">></span> <span class="mi">0</span> <span class="ow">and</span> <span class="p">(</span><span class="n">level</span> <span class="o">></span> <span class="n">index_nlevels</span> <span class="o">-</span> <span class="mi">1</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">IndexError</span><span class="p">(</span> |
| <span class="s2">"Too many levels: Index has only </span><span class="si">{}</span><span class="s2"> levels, not </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">index_nlevels</span><span class="p">,</span> <span class="n">level</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="n">level</span> <span class="o">&lt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="p">(</span><span class="n">level</span> <span class="o">&lt;</span> <span class="o">-</span><span class="n">index_nlevels</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">IndexError</span><span class="p">(</span> |
| <span class="s2">"Too many levels: Index has only </span><span class="si">{}</span><span class="s2"> levels, </span><span class="si">{}</span><span class="s2"> is not a valid level number"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">index_nlevels</span><span class="p">,</span> <span class="n">level</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| |
| <span class="n">index_map</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span> |
| <span class="nb">zip</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">pivot_col</span><span class="p">,</span> <span class="n">column_label_names</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">index_map</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="n">level</span><span class="p">)</span> |
| <span class="n">index_scol_names</span><span class="p">,</span> <span class="n">index_names</span><span class="p">,</span> <span class="n">index_fields</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">index_map</span><span class="p">)</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">index_scol_names</span><span class="p">))</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">pivot_col</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">first</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)))</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_scol_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_names</span><span class="p">),</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_fields</span><span class="p">),</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="p">[</span><span class="n">column_label_names</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_fields</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.item"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.item.html#pyspark.pandas.Series.item">[docs]</a> <span class="k">def</span> <span class="nf">item</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Scalar</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the first element of the underlying data as a Python scalar.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> scalar</span> |
| <span class="sd"> The first element of Series.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> ValueError</span> |
| <span class="sd"> If the data is not length-1.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psser = ps.Series([10])</span> |
| <span class="sd"> >>> psser.item()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">item</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="Series.iteritems"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.iteritems.html#pyspark.pandas.Series.iteritems">[docs]</a> <span class="k">def</span> <span class="nf">iteritems</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Lazily iterate over (index, value) tuples.</span> |
| |
| <span class="sd"> This method returns an iterable of (index, value) tuples. This is</span> |
| <span class="sd"> convenient if you want to create a lazy iterator.</span> |
| |
| <span class="sd"> .. note:: Unlike pandas, the iteritems in pandas-on-Spark returns a generator rather</span> |
| <span class="sd"> than a zip object.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> iterable</span> |
| <span class="sd"> Iterable of tuples containing the (index, value) pairs from a</span> |
| <span class="sd"> Series.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.items : Iterate over (column name, Series) pairs.</span> |
| <span class="sd"> DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = ps.Series(['A', 'B', 'C'])</span> |
| <span class="sd"> >>> for index, value in s.items():</span> |
| <span class="sd"> ... print("Index : {}, Value : {}".format(index, value))</span> |
| <span class="sd"> Index : 0, Value : A</span> |
| <span class="sd"> Index : 1, Value : B</span> |
| <span class="sd"> Index : 2, Value : C</span> |
| <span class="sd"> """</span> |
| <span class="n">internal_index_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">internal_data_column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="k">def</span> <span class="nf">extract_kv_from_spark_row</span><span class="p">(</span><span class="n">row</span><span class="p">:</span> <span class="n">Row</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">]:</span> |
| <span class="n">k</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">row</span><span class="p">[</span><span class="n">internal_index_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">internal_index_columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="k">else</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">row</span><span class="p">[</span><span class="n">c</span><span class="p">]</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">internal_index_columns</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">v</span> <span class="o">=</span> <span class="n">row</span><span class="p">[</span><span class="n">internal_data_column</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> |
| |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">map</span><span class="p">(</span> |
| <span class="n">extract_kv_from_spark_row</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">toLocalIterator</span><span class="p">()</span> |
| <span class="p">):</span> |
| <span class="k">yield</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span></div> |
| |
| <div class="viewcode-block" id="Series.items"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.items.html#pyspark.pandas.Series.items">[docs]</a> <span class="k">def</span> <span class="nf">items</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]:</span> |
| <span class="sd">"""This is an alias of ``iteritems``."""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="Series.droplevel"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.droplevel.html#pyspark.pandas.Series.droplevel">[docs]</a> <span class="k">def</span> <span class="nf">droplevel</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">level</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]]])</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return Series with requested index level(s) removed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> level : int, str, or list-like</span> |
| <span class="sd"> If a string is given, must be the name of a level</span> |
| <span class="sd"> If list-like, elements must be names or positional indexes</span> |
| <span class="sd"> of levels.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Series with requested index level(s) removed.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psser = ps.Series(</span> |
| <span class="sd"> ... [1, 2, 3],</span> |
| <span class="sd"> ... index=pd.MultiIndex.from_tuples(</span> |
| <span class="sd"> ... [("x", "a"), ("x", "b"), ("y", "c")], names=["level_1", "level_2"]</span> |
| <span class="sd"> ... ),</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> level_1 level_2</span> |
| <span class="sd"> x a 1</span> |
| <span class="sd"> b 2</span> |
| <span class="sd"> y c 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Removing specific index level by position</span> |
| |
| <span class="sd"> >>> psser.droplevel(0)</span> |
| <span class="sd"> level_2</span> |
| <span class="sd"> a 1</span> |
| <span class="sd"> b 2</span> |
| <span class="sd"> c 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Removing specific index level by name</span> |
| |
| <span class="sd"> >>> psser.droplevel("level_2")</span> |
| <span class="sd"> level_1</span> |
| <span class="sd"> x 1</span> |
| <span class="sd"> x 2</span> |
| <span class="sd"> y 3</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">droplevel</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">level</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.tail"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.tail.html#pyspark.pandas.Series.tail">[docs]</a> <span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the last `n` rows.</span> |
| |
| <span class="sd"> This function returns last `n` rows from the object based on</span> |
| <span class="sd"> position. It is useful for quickly verifying data, for example,</span> |
| <span class="sd"> after sorting or appending rows.</span> |
| |
| <span class="sd"> For negative values of `n`, this function returns all rows except</span> |
| <span class="sd"> the first ``abs(n)`` rows, equivalent to ``df[abs(n):]``.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, default 5</span> |
| <span class="sd"> Number of rows to select.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> type of caller</span> |
| <span class="sd"> The last `n` rows of the caller object.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.head : The first `n` rows of the caller object.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psser = ps.Series([1, 2, 3, 4, 5])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> 4 5</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> psser.tail(3) # doctest: +SKIP</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> 4 5</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="n">n</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.explode"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.explode.html#pyspark.pandas.Series.explode">[docs]</a> <span class="k">def</span> <span class="nf">explode</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Transform each element of a list-like to a row.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Exploded lists to rows; index will be duplicated for these rows.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.str.split : Split string values on specified separator.</span> |
| <span class="sd"> Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex</span> |
| <span class="sd"> to produce DataFrame.</span> |
| <span class="sd"> DataFrame.melt : Unpivot a DataFrame from wide format to long format.</span> |
| <span class="sd"> DataFrame.explode : Explode a DataFrame from list-like</span> |
| <span class="sd"> columns to long format.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psser = ps.Series([[1, 2, 3], [], [3, 4]])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 0 [1, 2, 3]</span> |
| <span class="sd"> 1 []</span> |
| <span class="sd"> 2 [3, 4]</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> >>> psser.explode() # doctest: +SKIP</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 0 2.0</span> |
| <span class="sd"> 0 3.0</span> |
| <span class="sd"> 1 NaN</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 2 4.0</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">ArrayType</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">explode_outer</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_label</span><span class="p">))</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">([</span><span class="n">scol</span><span class="p">],</span> <span class="n">keep_order</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="Series.argsort"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.argsort.html#pyspark.pandas.Series.argsort">[docs]</a> <span class="k">def</span> <span class="nf">argsort</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the integer indices that would sort the Series values.</span> |
| <span class="sd"> Unlike pandas, the index order is not preserved in the result.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Positions of values within the sort order with -1 indicating</span> |
| <span class="sd"> nan values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psser = ps.Series([3, 3, 4, 1, 6, 2, 3, 7, 8, 7, 10])</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 0 3</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 4</span> |
| <span class="sd"> 3 1</span> |
| <span class="sd"> 4 6</span> |
| <span class="sd"> 5 2</span> |
| <span class="sd"> 6 3</span> |
| <span class="sd"> 7 7</span> |
| <span class="sd"> 8 8</span> |
| <span class="sd"> 9 7</span> |
| <span class="sd"> 10 10</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> psser.argsort().sort_index()</span> |
| <span class="sd"> 0 3</span> |
| <span class="sd"> 1 5</span> |
| <span class="sd"> 2 0</span> |
| <span class="sd"> 3 1</span> |
| <span class="sd"> 4 6</span> |
| <span class="sd"> 5 2</span> |
| <span class="sd"> 6 4</span> |
| <span class="sd"> 7 7</span> |
| <span class="sd"> 8 9</span> |
| <span class="sd"> 9 8</span> |
| <span class="sd"> 10 10</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">notnull</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">notnull</span><span class="p">()]</span> |
| |
| <span class="n">sdf_for_index</span> <span class="o">=</span> <span class="n">notnull</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">notnull</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| |
| <span class="n">tmp_join_key</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf_for_index</span><span class="p">,</span> <span class="s2">"__tmp_join_key__"</span><span class="p">)</span> |
| <span class="n">sdf_for_index</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">attach_distributed_sequence_column</span><span class="p">(</span> |
| <span class="n">sdf_for_index</span><span class="p">,</span> <span class="n">tmp_join_key</span> |
| <span class="p">)</span> |
| <span class="c1"># sdf_for_index:</span> |
| <span class="c1"># +----------------+-----------------+</span> |
| <span class="c1"># |__tmp_join_key__|__index_level_0__|</span> |
| <span class="c1"># +----------------+-----------------+</span> |
| <span class="c1"># | 0| 0|</span> |
| <span class="c1"># | 1| 1|</span> |
| <span class="c1"># | 2| 2|</span> |
| <span class="c1"># | 3| 3|</span> |
| <span class="c1"># | 4| 4|</span> |
| <span class="c1"># +----------------+-----------------+</span> |
| |
| <span class="n">sdf_for_data</span> <span class="o">=</span> <span class="n">notnull</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">notnull</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"values"</span><span class="p">),</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span> |
| <span class="p">)</span> |
| <span class="n">sdf_for_data</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">attach_distributed_sequence_column</span><span class="p">(</span> |
| <span class="n">sdf_for_data</span><span class="p">,</span> <span class="n">SPARK_DEFAULT_SERIES_NAME</span> |
| <span class="p">)</span> |
| <span class="c1"># sdf_for_data:</span> |
| <span class="c1"># +---+------+-----------------+</span> |
| <span class="c1"># | 0|values|__natural_order__|</span> |
| <span class="c1"># +---+------+-----------------+</span> |
| <span class="c1"># | 0| 3| 25769803776|</span> |
| <span class="c1"># | 1| 3| 51539607552|</span> |
| <span class="c1"># | 2| 4| 77309411328|</span> |
| <span class="c1"># | 3| 1| 103079215104|</span> |
| <span class="c1"># | 4| 2| 128849018880|</span> |
| <span class="c1"># +---+------+-----------------+</span> |
| |
| <span class="n">sdf_for_data</span> <span class="o">=</span> <span class="n">sdf_for_data</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf_for_data</span><span class="p">,</span> <span class="s2">"values"</span><span class="p">),</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="s2">"values"</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| |
| <span class="n">tmp_join_key</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf_for_data</span><span class="p">,</span> <span class="s2">"__tmp_join_key__"</span><span class="p">)</span> |
| <span class="n">sdf_for_data</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">attach_distributed_sequence_column</span><span class="p">(</span><span class="n">sdf_for_data</span><span class="p">,</span> <span class="n">tmp_join_key</span><span class="p">)</span> |
| <span class="c1"># sdf_for_index: sdf_for_data:</span> |
| <span class="c1"># +----------------+-----------------+ +----------------+---+</span> |
| <span class="c1"># |__tmp_join_key__|__index_level_0__| |__tmp_join_key__| 0|</span> |
| <span class="c1"># +----------------+-----------------+ +----------------+---+</span> |
| <span class="c1"># | 0| 0| | 0| 3|</span> |
| <span class="c1"># | 1| 1| | 1| 4|</span> |
| <span class="c1"># | 2| 2| | 2| 0|</span> |
| <span class="c1"># | 3| 3| | 3| 1|</span> |
| <span class="c1"># | 4| 4| | 4| 2|</span> |
| <span class="c1"># +----------------+-----------------+ +----------------+---+</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf_for_index</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">sdf_for_data</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="n">tmp_join_key</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_join_key</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">data_columns</span><span class="o">=</span><span class="p">[</span><span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">field</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span> |
| <span class="n">Series</span><span class="p">,</span> |
| <span class="n">ps</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">psser</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">isnull</span><span class="p">()]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">))]),</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.argmax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.argmax.html#pyspark.pandas.Series.argmax">[docs]</a> <span class="k">def</span> <span class="nf">argmax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return int position of the largest value in the Series.</span> |
| |
| <span class="sd"> If the maximum is achieved in multiple locations,</span> |
| <span class="sd"> the first row position is returned.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> int</span> |
| <span class="sd"> Row position of the maximum value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Consider a dataset containing cereal calories</span> |
| |
| <span class="sd"> >>> s = ps.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0,</span> |
| <span class="sd"> ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0})</span> |
| <span class="sd"> >>> s # doctest: +SKIP</span> |
| <span class="sd"> Corn Flakes 100.0</span> |
| <span class="sd"> Almond Delight 110.0</span> |
| <span class="sd"> Cinnamon Toast Crunch 120.0</span> |
| <span class="sd"> Cocoa Puff 110.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.argmax() # doctest: +SKIP</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="n">max_value</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">first</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">),</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">max_value</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"attempt to get argmax of an empty sequence"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">max_value</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="o">-</span><span class="mi">1</span> |
| <span class="c1"># Attach a sequence column starting from 0 so that the row position can be returned</span> |
| <span class="n">seq_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__distributed_sequence_column__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">attach_distributed_sequence_column</span><span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">),</span> <span class="n">seq_col_name</span> |
| <span class="p">)</span> |
| <span class="c1"># If the maximum is achieved in multiple locations, the first row position is returned.</span> |
| <span class="k">return</span> <span class="n">sdf</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">==</span> <span class="n">max_value</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span></div> |
| |
| <div class="viewcode-block" id="Series.argmin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.argmin.html#pyspark.pandas.Series.argmin">[docs]</a> <span class="k">def</span> <span class="nf">argmin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return int position of the smallest value in the Series.</span> |
| |
| <span class="sd"> If the minimum is achieved in multiple locations,</span> |
| <span class="sd"> the first row position is returned.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> int</span> |
| <span class="sd"> Row position of the minimum value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Consider a dataset containing cereal calories</span> |
| |
| <span class="sd"> >>> s = ps.Series({'Corn Flakes': 100.0, 'Almond Delight': 110.0,</span> |
| <span class="sd"> ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0})</span> |
| <span class="sd"> >>> s # doctest: +SKIP</span> |
| <span class="sd"> Corn Flakes 100.0</span> |
| <span class="sd"> Almond Delight 110.0</span> |
| <span class="sd"> Cinnamon Toast Crunch 120.0</span> |
| <span class="sd"> Cocoa Puff 110.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> s.argmin() # doctest: +SKIP</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> """</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="n">min_value</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">first</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">),</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">min_value</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"attempt to get argmin of an empty sequence"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">min_value</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="o">-</span><span class="mi">1</span> |
| <span class="c1"># We should remember the natural sequence started from 0</span> |
| <span class="n">seq_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__distributed_sequence_column__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">attach_distributed_sequence_column</span><span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">),</span> <span class="n">seq_col_name</span> |
| <span class="p">)</span> |
| <span class="c1"># If the minimum is achieved in multiple locations, the first row position is returned.</span> |
| <span class="k">return</span> <span class="n">sdf</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">==</span> <span class="n">min_value</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()[</span><span class="mi">0</span><span class="p">]</span></div> |
| |
| <div class="viewcode-block" id="Series.compare"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.compare.html#pyspark.pandas.Series.compare">[docs]</a> <span class="k">def</span> <span class="nf">compare</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">keep_shape</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">keep_equal</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare to another Series and show the differences.</span> |
| |
| <span class="sd"> .. note:: This API is slightly different from pandas when indexes from both Series</span> |
| <span class="sd"> are not identical and config 'compute.eager_check' is False. pandas raises an exception;</span> |
| <span class="sd"> however, pandas-on-Spark just proceeds and performs the comparison, ignoring the mismatches.</span> |
| |
| <span class="sd"> >>> psser1 = ps.Series([1, 2, 3, 4, 5], index=pd.Index([1, 2, 3, 4, 5]))</span> |
| <span class="sd"> >>> psser2 = ps.Series([1, 2, 3, 4, 5], index=pd.Index([1, 2, 4, 3, 6]))</span> |
| <span class="sd"> >>> psser1.compare(psser2) # doctest: +SKIP</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: Can only compare identically-labeled Series objects</span> |
| |
| <span class="sd"> >>> with ps.option_context("compute.eager_check", False):</span> |
| <span class="sd"> ... psser1.compare(psser2) # doctest: +SKIP</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> self other</span> |
| <span class="sd"> 3 3.0 4.0</span> |
| <span class="sd"> 4 4.0 3.0</span> |
| <span class="sd"> 5 5.0 NaN</span> |
| <span class="sd"> 6 NaN 5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series</span> |
| <span class="sd"> Object to compare with.</span> |
| <span class="sd"> keep_shape : bool, default False</span> |
| <span class="sd"> If true, all rows and columns are kept.</span> |
| <span class="sd"> Otherwise, only the ones with different values are kept.</span> |
| <span class="sd"> keep_equal : bool, default False</span> |
| <span class="sd"> If true, the result keeps values that are equal.</span> |
| <span class="sd"> Otherwise, equal values are shown as NaNs.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Matching NaNs will not appear as a difference.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> s1 = ps.Series(["a", "b", "c", "d", "e"])</span> |
| <span class="sd"> >>> s2 = ps.Series(["a", "a", "c", "b", "e"])</span> |
| |
| <span class="sd"> Align the differences on columns</span> |
| |
| <span class="sd"> >>> s1.compare(s2).sort_index()</span> |
| <span class="sd"> self other</span> |
| <span class="sd"> 1 b a</span> |
| <span class="sd"> 3 d b</span> |
| |
| <span class="sd"> Keep all original rows</span> |
| |
| <span class="sd"> >>> s1.compare(s2, keep_shape=True).sort_index()</span> |
| <span class="sd"> self other</span> |
| <span class="sd"> 0 None None</span> |
| <span class="sd"> 1 b a</span> |
| <span class="sd"> 2 None None</span> |
| <span class="sd"> 3 d b</span> |
| <span class="sd"> 4 None None</span> |
| |
| <span class="sd"> Keep all original rows and also all original values</span> |
| |
| <span class="sd"> >>> s1.compare(s2, keep_shape=True, keep_equal=True).sort_index()</span> |
| <span class="sd"> self other</span> |
| <span class="sd"> 0 a a</span> |
| <span class="sd"> 1 b a</span> |
| <span class="sd"> 2 c c</span> |
| <span class="sd"> 3 d b</span> |
| <span class="sd"> 4 e e</span> |
| |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="n">combined</span><span class="p">:</span> <span class="n">DataFrame</span> |
| <span class="k">if</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="n">self_column_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="s2">"__self_column__"</span><span class="p">)</span> |
| <span class="n">other_column_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="s2">"__other_column__"</span><span class="p">)</span> |
| <span class="n">combined</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">self_column_label</span><span class="p">),</span> <span class="n">other</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">other_column_label</span><span class="p">)]</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.eager_check"</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">index</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Can only compare identically-labeled Series objects"</span><span class="p">)</span> |
| |
| <span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="n">other</span><span class="o">.</span><span class="n">to_frame</span><span class="p">())</span> |
| |
| <span class="n">this_column_label</span> <span class="o">=</span> <span class="s2">"self"</span> |
| <span class="n">that_column_label</span> <span class="o">=</span> <span class="s2">"other"</span> |
| <span class="k">if</span> <span class="n">keep_equal</span> <span class="ow">and</span> <span class="n">keep_shape</span><span class="p">:</span> |
| <span class="n">combined</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">([</span><span class="n">this_column_label</span><span class="p">,</span> <span class="n">that_column_label</span><span class="p">])</span> |
| <span class="k">return</span> <span class="n">combined</span> |
| |
| <span class="n">this_data_scol</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">that_data_scol</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> |
| <span class="n">index_scols</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="k">if</span> <span class="n">keep_shape</span><span class="p">:</span> |
| <span class="n">this_scol</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">this_data_scol</span> <span class="o">==</span> <span class="n">that_data_scol</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">this_data_scol</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">this_column_label</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">this_field</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">name</span><span class="o">=</span><span class="n">this_column_label</span><span class="p">,</span> <span class="n">nullable</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span> |
| |
| <span class="n">that_scol</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">this_data_scol</span> <span class="o">==</span> <span class="n">that_data_scol</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">that_data_scol</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">that_column_label</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">that_field</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">name</span><span class="o">=</span><span class="n">that_column_label</span><span class="p">,</span> <span class="n">nullable</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="o">~</span><span class="n">this_data_scol</span><span class="o">.</span><span class="n">eqNullSafe</span><span class="p">(</span><span class="n">that_data_scol</span><span class="p">))</span> |
| |
| <span class="n">this_scol</span> <span class="o">=</span> <span class="n">this_data_scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">this_column_label</span><span class="p">)</span> |
| <span class="n">this_field</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">this_column_label</span><span class="p">)</span> |
| |
| <span class="n">that_scol</span> <span class="o">=</span> <span class="n">that_data_scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">that_column_label</span><span class="p">)</span> |
| <span class="n">that_field</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">that_column_label</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">index_scols</span><span class="p">,</span> <span class="n">this_scol</span><span class="p">,</span> <span class="n">that_scol</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[(</span><span class="n">this_column_label</span><span class="p">,),</span> <span class="p">(</span><span class="n">that_column_label</span><span class="p">,)],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">this_column_label</span><span class="p">),</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">that_column_label</span><span class="p">)],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">this_field</span><span class="p">,</span> <span class="n">that_field</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.align"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.align.html#pyspark.pandas.Series.align">[docs]</a> <span class="k">def</span> <span class="nf">align</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">other</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">],</span> |
| <span class="n">join</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"outer"</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">copy</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Align two objects on their axes with the specified join method.</span> |
| |
| <span class="sd"> Join method is specified for each axis Index.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : DataFrame or Series</span> |
| <span class="sd"> join : {'outer', 'inner', 'left', 'right'}, default 'outer'</span> |
| <span class="sd"> axis : allowed axis of the other object, default None</span> |
| <span class="sd"> Align on index (0), columns (1), or both (None).</span> |
| <span class="sd"> copy : bool, default True</span> |
| <span class="sd"> Always returns new objects. If copy=False and no reindexing is</span> |
| <span class="sd"> required then original objects are returned.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> (left, right) : (Series, type of other)</span> |
| <span class="sd"> Aligned objects.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> s1 = ps.Series([7, 8, 9], index=[10, 11, 12])</span> |
| <span class="sd"> >>> s2 = ps.Series(["g", "h", "i"], index=[10, 20, 30])</span> |
| |
| <span class="sd"> >>> aligned_l, aligned_r = s1.align(s2)</span> |
| <span class="sd"> >>> aligned_l.sort_index()</span> |
| <span class="sd"> 10 7.0</span> |
| <span class="sd"> 11 8.0</span> |
| <span class="sd"> 12 9.0</span> |
| <span class="sd"> 20 NaN</span> |
| <span class="sd"> 30 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> >>> aligned_r.sort_index()</span> |
| <span class="sd"> 10 g</span> |
| <span class="sd"> 11 None</span> |
| <span class="sd"> 12 None</span> |
| <span class="sd"> 20 h</span> |
| <span class="sd"> 30 i</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> Align with the join type "inner":</span> |
| |
| <span class="sd"> >>> aligned_l, aligned_r = s1.align(s2, join="inner")</span> |
| <span class="sd"> >>> aligned_l.sort_index()</span> |
| <span class="sd"> 10 7</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> >>> aligned_r.sort_index()</span> |
| <span class="sd"> 10 g</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> Align with a DataFrame:</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=[10, 20, 30])</span> |
| <span class="sd"> >>> aligned_l, aligned_r = s1.align(df)</span> |
| <span class="sd"> >>> aligned_l.sort_index()</span> |
| <span class="sd"> 10 7.0</span> |
| <span class="sd"> 11 8.0</span> |
| <span class="sd"> 12 9.0</span> |
| <span class="sd"> 20 NaN</span> |
| <span class="sd"> 30 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> >>> aligned_r.sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 10 1.0 a</span> |
| <span class="sd"> 11 NaN None</span> |
| <span class="sd"> 12 NaN None</span> |
| <span class="sd"> 20 2.0 b</span> |
| <span class="sd"> 30 3.0 c</span> |
| |
| <span class="sd"> >>> ps.reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Series does not support columns axis."</span><span class="p">)</span> |
| |
| <span class="n">self_df</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">left</span><span class="p">,</span> <span class="n">right</span> <span class="o">=</span> <span class="n">self_df</span><span class="o">.</span><span class="n">align</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">join</span><span class="o">=</span><span class="n">join</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">left</span> <span class="ow">is</span> <span class="n">self_df</span><span class="p">:</span> |
| <span class="n">left_ser</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">left_ser</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="n">left</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="p">(</span><span class="n">left_ser</span><span class="o">.</span><span class="n">copy</span><span class="p">(),</span> <span class="n">right</span><span class="o">.</span><span class="n">copy</span><span class="p">())</span> <span class="k">if</span> <span class="n">copy</span> <span class="k">else</span> <span class="p">(</span><span class="n">left_ser</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.between_time"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.between_time.html#pyspark.pandas.Series.between_time">[docs]</a> <span class="k">def</span> <span class="nf">between_time</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">start_time</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">datetime</span><span class="o">.</span><span class="n">time</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">end_time</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">datetime</span><span class="o">.</span><span class="n">time</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">include_start</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">include_end</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Select values between particular times of the day (example: 9:00-9:30 AM).</span> |
| |
| <span class="sd"> By setting ``start_time`` to be later than ``end_time``,</span> |
| <span class="sd"> you can get the times that are *not* between the two times.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start_time : datetime.time or str</span> |
| <span class="sd"> Initial time as a time filter limit.</span> |
| <span class="sd"> end_time : datetime.time or str</span> |
| <span class="sd"> End time as a time filter limit.</span> |
| <span class="sd"> include_start : bool, default True</span> |
| <span class="sd"> Whether the start time needs to be included in the result.</span> |
| <span class="sd"> include_end : bool, default True</span> |
| <span class="sd"> Whether the end time needs to be included in the result.</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, default 0</span> |
| <span class="sd"> Determine range time on index or columns value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Data from the original object filtered to the specified time range.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If the index is not a :class:`DatetimeIndex`</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> at_time : Select values at a particular time of the day.</span> |
| <span class="sd"> last : Select final periods of time series based on a date offset.</span> |
| <span class="sd"> DatetimeIndex.indexer_between_time : Get just the index locations for</span> |
| <span class="sd"> values between particular times of the day.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> idx = pd.date_range('2018-04-09', periods=4, freq='1D20min')</span> |
| <span class="sd"> >>> psser = ps.Series([1, 2, 3, 4], index=idx)</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 2018-04-09 00:00:00 1</span> |
| <span class="sd"> 2018-04-10 00:20:00 2</span> |
| <span class="sd"> 2018-04-11 00:40:00 3</span> |
| <span class="sd"> 2018-04-12 01:00:00 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> psser.between_time('0:15', '0:45')</span> |
| <span class="sd"> 2018-04-10 00:20:00 2</span> |
| <span class="sd"> 2018-04-11 00:40:00 3</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">between_time</span><span class="p">(</span><span class="n">start_time</span><span class="p">,</span> <span class="n">end_time</span><span class="p">,</span> <span class="n">include_start</span><span class="p">,</span> <span class="n">include_end</span><span class="p">,</span> <span class="n">axis</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="Series.at_time"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.at_time.html#pyspark.pandas.Series.at_time">[docs]</a> <span class="k">def</span> <span class="nf">at_time</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">time</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">datetime</span><span class="o">.</span><span class="n">time</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> <span class="n">asof</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Select values at a particular time of day (example: 9:30AM).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> time : datetime.time or str</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, default 0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If the index is not a :class:`DatetimeIndex`</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> between_time : Select values between particular times of the day.</span> |
| <span class="sd"> DatetimeIndex.indexer_at_time : Get just the index locations for</span> |
| <span class="sd"> values at particular time of the day.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> idx = pd.date_range('2018-04-09', periods=4, freq='12H')</span> |
| <span class="sd"> >>> psser = ps.Series([1, 2, 3, 4], index=idx)</span> |
| <span class="sd"> >>> psser</span> |
| <span class="sd"> 2018-04-09 00:00:00 1</span> |
| <span class="sd"> 2018-04-09 12:00:00 2</span> |
| <span class="sd"> 2018-04-10 00:00:00 3</span> |
| <span class="sd"> 2018-04-10 12:00:00 4</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> psser.at_time('12:00')</span> |
| <span class="sd"> 2018-04-09 12:00:00 2</span> |
| <span class="sd"> 2018-04-10 12:00:00 4</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">at_time</span><span class="p">(</span><span class="n">time</span><span class="p">,</span> <span class="n">asof</span><span class="p">,</span> <span class="n">axis</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_cum</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">part_cols</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span> |
| <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="c1"># This is used by cummin, cummax, cumsum, etc.</span> |
| |
| <span class="k">if</span> <span class="n">ascending</span><span class="p">:</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">asc</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">desc</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">skipna</span><span class="p">:</span> |
| <span class="c1"># There is a behavior difference between pandas and PySpark. In case of cummax,</span> |
| <span class="c1">#</span> |
| <span class="c1"># Input:</span> |
| <span class="c1"># A B</span> |
| <span class="c1"># 0 2.0 1.0</span> |
| <span class="c1"># 1 5.0 NaN</span> |
| <span class="c1"># 2 1.0 0.0</span> |
| <span class="c1"># 3 2.0 4.0</span> |
| <span class="c1"># 4 4.0 9.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># pandas:</span> |
| <span class="c1"># A B</span> |
| <span class="c1"># 0 2.0 1.0</span> |
| <span class="c1"># 1 5.0 NaN</span> |
| <span class="c1"># 2 5.0 1.0</span> |
| <span class="c1"># 3 5.0 4.0</span> |
| <span class="c1"># 4 5.0 9.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># PySpark:</span> |
| <span class="c1"># A B</span> |
| <span class="c1"># 0 2.0 1.0</span> |
| <span class="c1"># 1 5.0 1.0</span> |
| <span class="c1"># 2 5.0 1.0</span> |
| <span class="c1"># 3 5.0 4.0</span> |
| <span class="c1"># 4 5.0 9.0</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="c1"># Manually sets nulls given the column defined above.</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> |
| <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">),</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">func</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Here, we use two window expressions over the same window.</span> |
| <span class="c1"># One for real data.</span> |
| <span class="c1"># The other one for setting nulls from the first null it meets onward.</span> |
| <span class="c1">#</span> |
| <span class="c1"># There is a behavior difference between pandas and PySpark. In case of cummax,</span> |
| <span class="c1">#</span> |
| <span class="c1"># Input:</span> |
| <span class="c1"># A B</span> |
| <span class="c1"># 0 2.0 1.0</span> |
| <span class="c1"># 1 5.0 NaN</span> |
| <span class="c1"># 2 1.0 0.0</span> |
| <span class="c1"># 3 2.0 4.0</span> |
| <span class="c1"># 4 4.0 9.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># pandas:</span> |
| <span class="c1"># A B</span> |
| <span class="c1"># 0 2.0 1.0</span> |
| <span class="c1"># 1 5.0 NaN</span> |
| <span class="c1"># 2 5.0 NaN</span> |
| <span class="c1"># 3 5.0 NaN</span> |
| <span class="c1"># 4 5.0 NaN</span> |
| <span class="c1">#</span> |
| <span class="c1"># PySpark:</span> |
| <span class="c1"># A B</span> |
| <span class="c1"># 0 2.0 1.0</span> |
| <span class="c1"># 1 5.0 1.0</span> |
| <span class="c1"># 2 5.0 1.0</span> |
| <span class="c1"># 3 5.0 4.0</span> |
| <span class="c1"># 4 5.0 9.0</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="c1"># Taking the running max of isNull() yields True from the first null onward.</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isNull</span><span class="p">())</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">),</span> |
| <span class="c1"># Manually sets nulls given the column defined above.</span> |
| <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">),</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">func</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_cumsum</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">part_cols</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="p">())</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="k">lambda</span> <span class="n">scol</span><span class="p">:</span> <span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">()))</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">),</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">(),</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">,</span> <span class="n">skipna</span><span class="p">,</span> <span class="n">part_cols</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_cumprod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">part_cols</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="p">())</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">scol</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">))),</span> <span class="n">skipna</span><span class="p">,</span> <span class="n">part_cols</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="n">num_zeros</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">scol</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">)),</span> <span class="n">skipna</span><span class="p">,</span> <span class="n">part_cols</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">num_negatives</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">scol</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">)),</span> <span class="n">skipna</span><span class="p">,</span> <span class="n">part_cols</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">sign</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">num_negatives</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">abs_prod</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="k">lambda</span> <span class="n">scol</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">scol</span><span class="p">))),</span> <span class="n">skipna</span><span class="p">,</span> <span class="n">part_cols</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="p">)</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">num_zeros</span> <span class="o">></span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">sign</span> <span class="o">*</span> <span class="n">abs_prod</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">IntegralType</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">round</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">(),</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| |
| <span class="c1"># ----------------------------------------------------------------------</span> |
| <span class="c1"># Accessor Methods</span> |
| <span class="c1"># ----------------------------------------------------------------------</span> |
| <span class="n">dt</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"dt"</span><span class="p">,</span> <span class="n">DatetimeMethods</span><span class="p">)</span> |
| <span class="nb">str</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"str"</span><span class="p">,</span> <span class="n">StringMethods</span><span class="p">)</span> |
| <span class="n">cat</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"cat"</span><span class="p">,</span> <span class="n">CategoricalAccessor</span><span class="p">)</span> |
| <span class="n">plot</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"plot"</span><span class="p">,</span> <span class="n">PandasOnSparkPlotAccessor</span><span class="p">)</span> |
| |
| <span class="c1"># ----------------------------------------------------------------------</span> |
| |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"Series"</span><span class="p">],</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Column</span><span class="p">]],</span> <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="n">psser_or_scol</span> <span class="o">=</span> <span class="n">op</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_scol</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psser_or_scol</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">psser_or_scol</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"Series"</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">name</span><span class="p">:</span> <span class="n">str_type</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Scalar</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Applies sfun to the column and returns a scalar</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sfun : the stats function to be used for aggregation</span> |
| <span class="sd"> name : original pandas API name.</span> |
| <span class="sd"> axis : used only for sanity check because Series only supports the index axis.</span> |
| <span class="sd"> numeric_only : not used by this implementation, but passed down by stats functions</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"Series does not support columns axis."</span><span class="p">)</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">sfun</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="n">min_count</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"min_count"</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">min_count</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">Frame</span><span class="o">.</span><span class="n">_count_expr</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">>=</span> <span class="n">min_count</span><span class="p">,</span> <span class="n">scol</span><span class="p">)</span> |
| |
| <span class="n">result</span> <span class="o">=</span> <span class="n">unpack_scalar</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">scol</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">result</span> <span class="k">if</span> <span class="n">result</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span> |
| |
| <span class="c1"># Override the `groupby` to specify the actual return type annotation.</span> |
| <div class="viewcode-block" id="Series.groupby"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.Series.groupby.html#pyspark.pandas.Series.groupby">[docs]</a> <span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">by</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]]],</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"SeriesGroupBy"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span> |
| <span class="s2">"SeriesGroupBy"</span><span class="p">,</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">by</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <span class="n">groupby</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">Frame</span><span class="o">.</span><span class="n">groupby</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="k">def</span> <span class="nf">_build_groupby</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"SeriesGroupBy"</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">SeriesGroupBy</span> |
| |
| <span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="o">.</span><span class="n">_build</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">by</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="k">if</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="nb">slice</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">any</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> <span class="o">==</span> <span class="nb">int</span> <span class="k">for</span> <span class="n">n</span> <span class="ow">in</span> <span class="p">[</span><span class="n">key</span><span class="o">.</span><span class="n">start</span><span class="p">,</span> <span class="n">key</span><span class="o">.</span><span class="n">stop</span><span class="p">]))</span> <span class="ow">or</span> <span class="p">(</span> |
| <span class="nb">type</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="o">==</span> <span class="nb">int</span> |
| <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">IntegerType</span><span class="p">,</span> <span class="n">LongType</span><span class="p">))</span> |
| <span class="p">):</span> |
| <span class="c1"># Seems like pandas Series always uses int as positional search when slicing</span> |
| <span class="c1"># with ints, searches based on index values when the value is int.</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> |
| <span class="k">except</span> <span class="n">SparkPandasIndexingError</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"Key length (</span><span class="si">{}</span><span class="s2">) exceeds index depth (</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">str_type</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">item</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"__"</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeries</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span> |
| <span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeries</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span><span class="s2">"'Series' object has no attribute '</span><span class="si">{}</span><span class="s2">'"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">item</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="nf">_to_internal_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a pandas Series directly from _internal to avoid overhead of copy.</span> |
| |
| <span class="sd"> This method is for internal use only.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">to_pandas_frame</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">]</span> |
| |
| <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">str_type</span><span class="p">:</span> |
| <span class="n">max_display_count</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"display.max_rows"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">max_display_count</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">to_string</span><span class="p">(</span> |
| <span class="n">name</span><span class="o">=</span><span class="nb">bool</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="nb">bool</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">pser</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_get_or_create_repr_pandas_cache</span><span class="p">(</span><span class="n">max_display_count</span><span class="p">)[</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">]</span> |
| <span class="n">pser_length</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">pser</span><span class="p">)</span> |
| <span class="n">pser</span> <span class="o">=</span> <span class="n">pser</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:</span><span class="n">max_display_count</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">pser_length</span> <span class="o">></span> <span class="n">max_display_count</span><span class="p">:</span> |
| <span class="n">repr_string</span> <span class="o">=</span> <span class="n">pser</span><span class="o">.</span><span class="n">to_string</span><span class="p">(</span><span class="n">length</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">rest</span><span class="p">,</span> <span class="n">prev_footer</span> <span class="o">=</span> <span class="n">repr_string</span><span class="o">.</span><span class="n">rsplit</span><span class="p">(</span><span class="s2">"</span><span class="se">\n</span><span class="s2">"</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> |
| <span class="n">match</span> <span class="o">=</span> <span class="n">REPR_PATTERN</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">prev_footer</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">length</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="s2">"length"</span><span class="p">)</span> |
| <span class="n">dtype_name</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">footer</span> <span class="o">=</span> <span class="s2">"</span><span class="se">\n</span><span class="s2">dtype: </span><span class="si">{dtype}</span><span class="se">\n</span><span class="s2">Showing only the first </span><span class="si">{length}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">length</span><span class="o">=</span><span class="n">length</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">pprint_thing</span><span class="p">(</span><span class="n">dtype_name</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">footer</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="s2">"</span><span class="se">\n</span><span class="s2">Name: </span><span class="si">{name}</span><span class="s2">, dtype: </span><span class="si">{dtype}</span><span class="s2">"</span> |
| <span class="s2">"</span><span class="se">\n</span><span class="s2">Showing only the first </span><span class="si">{length}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">length</span><span class="o">=</span><span class="n">length</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">pprint_thing</span><span class="p">(</span><span class="n">dtype_name</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">rest</span> <span class="o">+</span> <span class="n">footer</span> |
| <span class="k">return</span> <span class="n">pser</span><span class="o">.</span><span class="n">to_string</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__dir__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="n">str_type</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">StructType</span><span class="p">):</span> |
| <span class="n">fields</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">fields</span> <span class="o">=</span> <span class="p">[</span><span class="n">f</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="o">.</span><span class="n">fieldNames</span><span class="p">()</span> <span class="k">if</span> <span class="s2">" "</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">f</span><span class="p">]</span> |
| <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__dir__</span><span class="p">())</span> <span class="o">+</span> <span class="n">fields</span> |
| |
| <span class="k">def</span> <span class="fm">__iter__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MissingPandasLikeSeries</span><span class="o">.</span><span class="fm">__iter__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">sys</span><span class="o">.</span><span class="n">version_info</span> <span class="o">>=</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">7</span><span class="p">):</span> |
| <span class="c1"># In order to support the type hints such as Series[...]. See DataFrame.__class_getitem__.</span> |
| <span class="k">def</span> <span class="nf">__class_getitem__</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Type</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">]:</span> |
| <span class="k">return</span> <span class="n">create_type_for_series_type</span><span class="p">(</span><span class="n">params</span><span class="p">)</span> |
| |
| <span class="k">elif</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span> <span class="o"><=</span> <span class="n">sys</span><span class="o">.</span><span class="n">version_info</span> <span class="o"><</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">7</span><span class="p">):</span> |
| <span class="c1"># The implementation is in its metaclass so this flag is needed to distinguish</span> |
| <span class="c1"># pandas-on-Spark Series.</span> |
| <span class="n">is_series</span> <span class="o">=</span> <span class="kc">None</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">unpack_scalar</span><span class="p">(</span><span class="n">sdf</span><span class="p">:</span> <span class="n">SparkDataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Takes a dataframe that is supposed to contain a single row with a single scalar value,</span> |
| <span class="sd"> and returns this value.</span> |
| <span class="sd"> """</span> |
| <span class="n">lst</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span> |
| <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">lst</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">lst</span><span class="p">)</span> |
| <span class="n">row</span> <span class="o">=</span> <span class="n">lst</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">lst2</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">row</span><span class="p">)</span> |
| <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">lst2</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="p">(</span><span class="n">row</span><span class="p">,</span> <span class="n">lst2</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">lst2</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">first_series</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">first_series</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="k">def</span> <span class="nf">first_series</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">])</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Takes a DataFrame and returns the first column of the DataFrame as a Series</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)),</span> <span class="nb">type</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">df</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">df</span><span class="p">[</span><span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">import</span> <span class="nn">pyspark.pandas.series</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_HOME"</span><span class="p">])</span> |
| |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">series</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"ps"</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"pyspark.pandas.series tests"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">series</span><span class="p">,</span> |
| <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| |
| <div class="prev-next-bottom"> |
| <!-- Intentionally left without links on this page; presumably the theme's slot for previous/next navigation (TODO confirm against the theme template). --> |
| |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <!-- Theme JavaScript bundle. The same URL is declared in the document head with rel="preload" as="script", so the fetch starts before the parser reaches this tag. --> |
| <script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <!-- Page footer: copyright notice followed by the documentation generator credit. --> |
| <div class="container"> |
| <p> |
| © Copyright The Apache Software Foundation.<br> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |