| |
| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8" /> |
| <title>pyspark.pandas.frame — PySpark 3.2.2 documentation</title> |
| |
| <!-- Theme stylesheet bundle. --> |
| <link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| <!-- Font Awesome stylesheet, plus preloads for two of its webfonts. --> |
| <link rel="stylesheet" href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| <!-- Vendored text fonts. --> |
| <link rel="stylesheet" href="../../../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| <!-- Base, syntax-highlighting and project-specific stylesheets. --> |
| <link rel="stylesheet" href="../../../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <!-- Local scripts; order is kept exactly as generated. --> |
| <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/language_data.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <!-- The MathJax configuration block is placed before the script that loads MathJax so it is present when MathJax starts. --> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <!-- Fixed top navigation bar: brand logo, collapse toggle, and the top-level section links. --> |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation"> |
| <div class="container-xl"> |
| |
| <!-- The logo is the only content of this link, so its alt text names the link destination. --> |
| <a class="navbar-brand" href="../../../index.html"> |
| <img src="../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home" /> |
| </a> |
| <!-- Toggle for the collapsible menu below (data-target / aria-controls point at #navbar-menu). --> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| <li class="nav-item"> |
| <a class="nav-link" href="../../../getting_started/index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link" href="../../../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link" href="../../../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link" href="../../../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item"> |
| <a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| </ul> |
| |
| <!-- Second navbar list; it has no items on this page and is kept as generated. --> |
| <ul class="navbar-nav"> |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <!-- Left sidebar: search form followed by the docs navigation tree. --> |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get"> |
| <!-- Decorative icon; the input carries its own accessible name via aria-label. --> |
| <i class="icon fas fa-search" aria-hidden="true"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| <ul class="nav bd-sidenav"> |
| </ul> |
| <!-- Explicitly close .bd-toc-item so the nav end tag no longer has to close it implicitly. --> |
| </div> |
| |
| </nav> |
| </div> |
| |
| |
| |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| <!-- In-page table of contents; the body element's data-target attribute references #bd-toc-nav. --> |
| <!-- Labeled so it can be told apart from the other nav landmarks on the page. --> |
| <nav id="bd-toc-nav" aria-label="Page contents"> |
| <ul class="nav section-nav flex-column"> |
| </ul> |
| </nav> |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <h1>Source code for pyspark.pandas.frame</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd">A wrapper class for Spark DataFrame to behave similar to pandas DataFrame.</span> |
| <span class="sd">"""</span> |
| <span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">OrderedDict</span><span class="p">,</span> <span class="n">defaultdict</span><span class="p">,</span> <span class="n">namedtuple</span> |
| <span class="kn">from</span> <span class="nn">collections.abc</span> <span class="kn">import</span> <span class="n">Mapping</span> |
| <span class="kn">from</span> <span class="nn">distutils.version</span> <span class="kn">import</span> <span class="n">LooseVersion</span> |
| <span class="kn">import</span> <span class="nn">re</span> |
| <span class="kn">import</span> <span class="nn">warnings</span> |
| <span class="kn">import</span> <span class="nn">inspect</span> |
| <span class="kn">import</span> <span class="nn">json</span> |
| <span class="kn">import</span> <span class="nn">types</span> |
| <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">partial</span><span class="p">,</span> <span class="n">reduce</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">zip_longest</span> |
| <span class="kn">from</span> <span class="nn">types</span> <span class="kn">import</span> <span class="n">TracebackType</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">Callable</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">Generic</span><span class="p">,</span> |
| <span class="n">IO</span><span class="p">,</span> |
| <span class="n">Iterable</span><span class="p">,</span> |
| <span class="n">Iterator</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">Sequence</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">Type</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">cast</span><span class="p">,</span> |
| <span class="n">no_type_check</span><span class="p">,</span> |
| <span class="n">TYPE_CHECKING</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">import</span> <span class="nn">datetime</span> |
| |
| <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span> |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_list_like</span><span class="p">,</span> <span class="n">is_dict_like</span><span class="p">,</span> <span class="n">is_scalar</span> |
| <span class="kn">from</span> <span class="nn">pandas.api.extensions</span> <span class="kn">import</span> <span class="n">ExtensionDtype</span> |
| <span class="kn">from</span> <span class="nn">pandas.tseries.frequencies</span> <span class="kn">import</span> <span class="n">DateOffset</span><span class="p">,</span> <span class="n">to_offset</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pandas.io.formats.style</span> <span class="kn">import</span> <span class="n">Styler</span> <span class="c1"># noqa: F401 (SPARK-34943)</span> |
| |
| <span class="k">if</span> <span class="n">LooseVersion</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">__version__</span><span class="p">)</span> <span class="o">>=</span> <span class="n">LooseVersion</span><span class="p">(</span><span class="s2">"0.24"</span><span class="p">):</span> |
| <span class="kn">from</span> <span class="nn">pandas.core.dtypes.common</span> <span class="kn">import</span> <span class="n">infer_dtype_from_object</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pandas.core.dtypes.common</span> <span class="kn">import</span> <span class="n">_get_dtype_from_object</span> <span class="k">as</span> <span class="n">infer_dtype_from_object</span> |
| <span class="kn">from</span> <span class="nn">pandas.core.accessor</span> <span class="kn">import</span> <span class="n">CachedAccessor</span> |
| <span class="kn">from</span> <span class="nn">pandas.core.dtypes.inference</span> <span class="kn">import</span> <span class="n">is_sequence</span> |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">StorageLevel</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">SparkDataFrame</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span> <span class="c1"># noqa: F401 (SPARK-34943)</span> |
| <span class="n">ArrayType</span><span class="p">,</span> |
| <span class="n">BooleanType</span><span class="p">,</span> |
| <span class="n">DataType</span><span class="p">,</span> |
| <span class="n">DoubleType</span><span class="p">,</span> |
| <span class="n">FloatType</span><span class="p">,</span> |
| <span class="n">NumericType</span><span class="p">,</span> |
| <span class="n">Row</span><span class="p">,</span> |
| <span class="n">StringType</span><span class="p">,</span> |
| <span class="n">StructField</span><span class="p">,</span> |
| <span class="n">StructType</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.window</span> <span class="kn">import</span> <span class="n">Window</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">DataFrameOrSeries</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">Name</span><span class="p">,</span> <span class="n">Scalar</span><span class="p">,</span> <span class="n">T</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.accessors</span> <span class="kn">import</span> <span class="n">PandasOnSparkFrameMethods</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.config</span> <span class="kn">import</span> <span class="n">option_context</span><span class="p">,</span> <span class="n">get_option</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark.accessors</span> <span class="kn">import</span> <span class="n">SparkFrameMethods</span><span class="p">,</span> <span class="n">CachedSparkFrameMethods</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">align_diff_frames</span><span class="p">,</span> |
| <span class="n">column_labels_level</span><span class="p">,</span> |
| <span class="n">combine_frames</span><span class="p">,</span> |
| <span class="n">default_session</span><span class="p">,</span> |
| <span class="n">is_name_like_tuple</span><span class="p">,</span> |
| <span class="n">is_name_like_value</span><span class="p">,</span> |
| <span class="n">is_testing</span><span class="p">,</span> |
| <span class="n">name_like_string</span><span class="p">,</span> |
| <span class="n">same_anchor</span><span class="p">,</span> |
| <span class="n">scol_for</span><span class="p">,</span> |
| <span class="n">validate_arguments_and_invoke_function</span><span class="p">,</span> |
| <span class="n">validate_axis</span><span class="p">,</span> |
| <span class="n">validate_bool_kwarg</span><span class="p">,</span> |
| <span class="n">validate_how</span><span class="p">,</span> |
| <span class="n">validate_mode</span><span class="p">,</span> |
| <span class="n">verify_temp_column_name</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.generic</span> <span class="kn">import</span> <span class="n">Frame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">InternalField</span><span class="p">,</span> |
| <span class="n">InternalFrame</span><span class="p">,</span> |
| <span class="n">HIDDEN_COLUMNS</span><span class="p">,</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">,</span> |
| <span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">,</span> |
| <span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.missing.frame</span> <span class="kn">import</span> <span class="n">_MissingPandasLikeDataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.ml</span> <span class="kn">import</span> <span class="n">corr</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.typedef.typehints</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">as_spark_type</span><span class="p">,</span> |
| <span class="n">infer_return_type</span><span class="p">,</span> |
| <span class="n">pandas_on_spark_type</span><span class="p">,</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">,</span> |
| <span class="n">DataFrameType</span><span class="p">,</span> |
| <span class="n">SeriesType</span><span class="p">,</span> |
| <span class="n">ScalarType</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.plot</span> <span class="kn">import</span> <span class="n">PandasOnSparkPlotAccessor</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql._typing</span> <span class="kn">import</span> <span class="n">OptionalPrimitiveType</span> <span class="c1"># noqa: F401 (SPARK-34943)</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">DataFrameGroupBy</span> <span class="c1"># noqa: F401 (SPARK-34943)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">Index</span> <span class="c1"># noqa: F401 (SPARK-34943)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> <span class="c1"># noqa: F401 (SPARK-34943)</span> |
| |
| |
| <span class="c1"># These regular expression patterns are compiled and defined here to avoid compiling the same</span> |
| <span class="c1"># pattern every time it is used in _repr_ and _repr_html_ in DataFrame.</span> |
| <span class="c1"># The two patterns basically seek the footer string from pandas' output.</span> |
| <span class="n">REPR_PATTERN</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s2">"\n\n\[(?P&lt;rows&gt;[0-9]+) rows x (?P&lt;columns&gt;[0-9]+) columns\]$"</span><span class="p">)</span> |
| <span class="n">REPR_HTML_PATTERN</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span> |
| <span class="sa">r</span><span class="s2">"\n\&lt;p\&gt;(?P&lt;rows&gt;[0-9]+) rows × (?P&lt;columns&gt;[0-9]+) columns\&lt;\/p\&gt;\n\&lt;\/div\&gt;$"</span> |
| <span class="p">)</span> |
| |
| |
| <span class="n">_flex_doc_FRAME</span> <span class="o">=</span> <span class="s2">"""</span> |
| <span class="s2">Get </span><span class="si">{desc}</span><span class="s2"> of dataframe and other, element-wise (binary operator `</span><span class="si">{op_name}</span><span class="s2">`).</span> |
| |
| <span class="s2">Equivalent to ``</span><span class="si">{equiv}</span><span class="s2">``. With reverse version, `</span><span class="si">{reverse}</span><span class="s2">`.</span> |
| |
| <span class="s2">Among flexible wrappers (`add`, `sub`, `mul`, `div`) to</span> |
| <span class="s2">arithmetic operators: `+`, `-`, `*`, `/`, `//`.</span> |
| |
| <span class="s2">Parameters</span> |
| <span class="s2">----------</span> |
| <span class="s2">other : scalar</span> |
| <span class="s2"> Any single data</span> |
| |
| <span class="s2">Returns</span> |
| <span class="s2">-------</span> |
| <span class="s2">DataFrame</span> |
| <span class="s2"> Result of the arithmetic operation.</span> |
| |
| <span class="s2">Examples</span> |
| <span class="s2">--------</span> |
| <span class="s2">>>> df = ps.DataFrame({{'angles': [0, 3, 4],</span> |
| <span class="s2">... 'degrees': [360, 180, 360]}},</span> |
| <span class="s2">... index=['circle', 'triangle', 'rectangle'],</span> |
| <span class="s2">... columns=['angles', 'degrees'])</span> |
| <span class="s2">>>> df</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0 360</span> |
| <span class="s2">triangle 3 180</span> |
| <span class="s2">rectangle 4 360</span> |
| |
| <span class="s2">Add a scalar with operator version which return the same</span> |
| <span class="s2">results. Also reverse version.</span> |
| |
| <span class="s2">>>> df + 1</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 1 361</span> |
| <span class="s2">triangle 4 181</span> |
| <span class="s2">rectangle 5 361</span> |
| |
| <span class="s2">>>> df.add(1)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 1 361</span> |
| <span class="s2">triangle 4 181</span> |
| <span class="s2">rectangle 5 361</span> |
| |
| <span class="s2">>>> df.add(df)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0 720</span> |
| <span class="s2">triangle 6 360</span> |
| <span class="s2">rectangle 8 720</span> |
| |
| <span class="s2">>>> df + df + df</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0 1080</span> |
| <span class="s2">triangle 9 540</span> |
| <span class="s2">rectangle 12 1080</span> |
| |
| <span class="s2">>>> df.radd(1)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 1 361</span> |
| <span class="s2">triangle 4 181</span> |
| <span class="s2">rectangle 5 361</span> |
| |
| <span class="s2">Divide and true divide by constant with reverse version.</span> |
| |
| <span class="s2">>>> df / 10</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0.0 36.0</span> |
| <span class="s2">triangle 0.3 18.0</span> |
| <span class="s2">rectangle 0.4 36.0</span> |
| |
| <span class="s2">>>> df.div(10)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0.0 36.0</span> |
| <span class="s2">triangle 0.3 18.0</span> |
| <span class="s2">rectangle 0.4 36.0</span> |
| |
| <span class="s2">>>> df.rdiv(10)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle inf 0.027778</span> |
| <span class="s2">triangle 3.333333 0.055556</span> |
| <span class="s2">rectangle 2.500000 0.027778</span> |
| |
| <span class="s2">>>> df.truediv(10)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0.0 36.0</span> |
| <span class="s2">triangle 0.3 18.0</span> |
| <span class="s2">rectangle 0.4 36.0</span> |
| |
| <span class="s2">>>> df.rtruediv(10)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle inf 0.027778</span> |
| <span class="s2">triangle 3.333333 0.055556</span> |
| <span class="s2">rectangle 2.500000 0.027778</span> |
| |
| <span class="s2">Subtract by constant with reverse version.</span> |
| |
| <span class="s2">>>> df - 1</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle -1 359</span> |
| <span class="s2">triangle 2 179</span> |
| <span class="s2">rectangle 3 359</span> |
| |
| <span class="s2">>>> df.sub(1)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle -1 359</span> |
| <span class="s2">triangle 2 179</span> |
| <span class="s2">rectangle 3 359</span> |
| |
| <span class="s2">>>> df.rsub(1)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 1 -359</span> |
| <span class="s2">triangle -2 -179</span> |
| <span class="s2">rectangle -3 -359</span> |
| |
| <span class="s2">Multiply by constant with reverse version.</span> |
| |
| <span class="s2">>>> df * 1</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0 360</span> |
| <span class="s2">triangle 3 180</span> |
| <span class="s2">rectangle 4 360</span> |
| |
| <span class="s2">>>> df.mul(1)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0 360</span> |
| <span class="s2">triangle 3 180</span> |
| <span class="s2">rectangle 4 360</span> |
| |
| <span class="s2">>>> df.rmul(1)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0 360</span> |
| <span class="s2">triangle 3 180</span> |
| <span class="s2">rectangle 4 360</span> |
| |
| <span class="s2">Floor Divide by constant with reverse version.</span> |
| |
| <span class="s2">>>> df // 10</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0.0 36.0</span> |
| <span class="s2">triangle 0.0 18.0</span> |
| <span class="s2">rectangle 0.0 36.0</span> |
| |
| <span class="s2">>>> df.floordiv(10)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0.0 36.0</span> |
| <span class="s2">triangle 0.0 18.0</span> |
| <span class="s2">rectangle 0.0 36.0</span> |
| |
| <span class="s2">>>> df.rfloordiv(10) # doctest: +SKIP</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle inf 0.0</span> |
| <span class="s2">triangle 3.0 0.0</span> |
| <span class="s2">rectangle 2.0 0.0</span> |
| |
| <span class="s2">Mod by constant with reverse version.</span> |
| |
| <span class="s2">>>> df % 2</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0 0</span> |
| <span class="s2">triangle 1 0</span> |
| <span class="s2">rectangle 0 0</span> |
| |
| <span class="s2">>>> df.mod(2)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0 0</span> |
| <span class="s2">triangle 1 0</span> |
| <span class="s2">rectangle 0 0</span> |
| |
| <span class="s2">>>> df.rmod(2)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle NaN 2</span> |
| <span class="s2">triangle 2.0 2</span> |
| <span class="s2">rectangle 2.0 2</span> |
| |
| <span class="s2">Power by constant with reverse version.</span> |
| |
| <span class="s2">>>> df ** 2</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0.0 129600.0</span> |
| <span class="s2">triangle 9.0 32400.0</span> |
| <span class="s2">rectangle 16.0 129600.0</span> |
| |
| <span class="s2">>>> df.pow(2)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 0.0 129600.0</span> |
| <span class="s2">triangle 9.0 32400.0</span> |
| <span class="s2">rectangle 16.0 129600.0</span> |
| |
| <span class="s2">>>> df.rpow(2)</span> |
| <span class="s2"> angles degrees</span> |
| <span class="s2">circle 1.0 2.348543e+108</span> |
| <span class="s2">triangle 8.0 1.532496e+54</span> |
| <span class="s2">rectangle 16.0 2.348543e+108</span> |
| <span class="s2">"""</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_create_tuple_for_frame_type</span><span class="p">(</span><span class="n">params</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="nb">object</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> This is a workaround to support variadic generic in DataFrame.</span> |
| |
| <span class="sd"> See https://github.com/python/typing/issues/193</span> |
| <span class="sd"> we always wraps the given type hints by a tuple to mimic the variadic generic.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">NameTypeHolder</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="nb">zip</span><span class="p">):</span> <span class="c1"># type: ignore</span> |
| <span class="n">params</span> <span class="o">=</span> <span class="p">[</span><span class="nb">slice</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">tpe</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span><span class="p">,</span> <span class="n">tpe</span> <span class="ow">in</span> <span class="n">params</span><span class="p">]</span> <span class="c1"># type: ignore</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="nb">slice</span><span class="p">):</span> |
| <span class="n">params</span> <span class="o">=</span> <span class="p">(</span><span class="n">params</span><span class="p">,)</span> |
| |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="nb">hasattr</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="s2">"__len__"</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">param</span><span class="p">,</span> <span class="nb">slice</span><span class="p">)</span> <span class="k">for</span> <span class="n">param</span> <span class="ow">in</span> <span class="n">params</span><span class="p">)</span> |
| <span class="p">):</span> |
| <span class="k">for</span> <span class="n">param</span> <span class="ow">in</span> <span class="n">params</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">param</span><span class="o">.</span><span class="n">start</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">and</span> <span class="n">param</span><span class="o">.</span><span class="n">step</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Type hints should be specified as "</span> |
| <span class="s2">"DataFrame['name': type]; however, got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">param</span> |
| <span class="p">)</span> |
| |
| <span class="n">name_classes</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">param</span> <span class="ow">in</span> <span class="n">params</span><span class="p">:</span> |
| <span class="n">new_class</span> <span class="o">=</span> <span class="nb">type</span><span class="p">(</span><span class="s2">"NameType"</span><span class="p">,</span> <span class="p">(</span><span class="n">NameTypeHolder</span><span class="p">,),</span> <span class="p">{})</span> <span class="c1"># type: Type[NameTypeHolder]</span> |
| <span class="n">new_class</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">param</span><span class="o">.</span><span class="n">start</span> |
| <span class="c1"># When the given argument is a numpy's dtype instance.</span> |
| <span class="n">new_class</span><span class="o">.</span><span class="n">tpe</span> <span class="o">=</span> <span class="n">param</span><span class="o">.</span><span class="n">stop</span><span class="o">.</span><span class="n">type</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">param</span><span class="o">.</span><span class="n">stop</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> <span class="k">else</span> <span class="n">param</span><span class="o">.</span><span class="n">stop</span> |
| <span class="n">name_classes</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">new_class</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="n">name_classes</span><span class="p">)]</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">params</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">):</span> |
| <span class="n">params</span> <span class="o">=</span> <span class="p">[</span><span class="n">params</span><span class="p">]</span> |
| |
| <span class="n">new_params</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">param</span> <span class="ow">in</span> <span class="n">params</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">param</span><span class="p">,</span> <span class="n">ExtensionDtype</span><span class="p">):</span> |
| <span class="n">new_class</span> <span class="o">=</span> <span class="nb">type</span><span class="p">(</span><span class="s2">"NameType"</span><span class="p">,</span> <span class="p">(</span><span class="n">NameTypeHolder</span><span class="p">,),</span> <span class="p">{})</span> |
| <span class="n">new_class</span><span class="o">.</span><span class="n">tpe</span> <span class="o">=</span> <span class="n">param</span> |
| <span class="n">new_params</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">new_class</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">new_params</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">param</span><span class="o">.</span><span class="n">type</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">param</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> <span class="k">else</span> <span class="n">param</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="n">new_params</span><span class="p">)]</span> |
| |
| |
| <span class="k">if</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span> <span class="o"><=</span> <span class="n">sys</span><span class="o">.</span><span class="n">version_info</span> <span class="o"><</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">7</span><span class="p">)</span> <span class="ow">and</span> <span class="vm">__name__</span> <span class="o">!=</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">GenericMeta</span> <span class="c1"># type: ignore</span> |
| |
| <span class="c1"># This is a workaround to support variadic generic in DataFrame in Python 3.5+.</span> |
| <span class="c1"># See https://github.com/python/typing/issues/193</span> |
| <span class="c1"># We wrap the input params in a tuple to mimic variadic generic.</span> |
| <span class="n">old_getitem</span> <span class="o">=</span> <span class="n">GenericMeta</span><span class="o">.</span><span class="fm">__getitem__</span> <span class="c1"># type: ignore</span> |
| |
| <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">new_getitem</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">params</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"is_dataframe"</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">old_getitem</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">_create_tuple_for_frame_type</span><span class="p">(</span><span class="n">params</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">old_getitem</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span> |
| |
| <span class="n">GenericMeta</span><span class="o">.</span><span class="fm">__getitem__</span> <span class="o">=</span> <span class="n">new_getitem</span> <span class="c1"># type: ignore</span> |
| |
| |
| <div class="viewcode-block" id="DataFrame"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.html#pyspark.pandas.DataFrame">[docs]</a><span class="k">class</span> <span class="nc">DataFrame</span><span class="p">(</span><span class="n">Frame</span><span class="p">,</span> <span class="n">Generic</span><span class="p">[</span><span class="n">T</span><span class="p">]):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> pandas-on-Spark DataFrame that corresponds to pandas DataFrame logically. This holds Spark</span> |
| <span class="sd"> DataFrame internally.</span> |
| |
| <span class="sd"> :ivar _internal: an internal immutable Frame to manage metadata.</span> |
| <span class="sd"> :type _internal: InternalFrame</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> data : numpy ndarray (structured or homogeneous), dict, pandas DataFrame, Spark DataFrame \</span> |
| <span class="sd"> or pandas-on-Spark Series</span> |
| <span class="sd"> Dict can contain Series, arrays, constants, or list-like objects</span> |
| <span class="sd"> If data is a dict, argument order is maintained for Python 3.6</span> |
| <span class="sd"> and later.</span> |
| <span class="sd"> Note that if `data` is a pandas DataFrame, a Spark DataFrame, or a pandas-on-Spark Series,</span> |
| <span class="sd"> other arguments should not be used.</span> |
| <span class="sd"> index : Index or array-like</span> |
| <span class="sd"> Index to use for resulting frame. Will default to RangeIndex if</span> |
| <span class="sd"> no indexing information is part of the input data and no index is provided</span> |
| <span class="sd"> columns : Index or array-like</span> |
| <span class="sd"> Column labels to use for resulting frame. Will default to</span> |
| <span class="sd"> RangeIndex (0, 1, 2, ..., n) if no column labels are provided</span> |
| <span class="sd"> dtype : dtype, default None</span> |
| <span class="sd"> Data type to force. Only a single dtype is allowed. If None, the dtype is inferred</span> |
| <span class="sd"> copy : boolean, default False</span> |
| <span class="sd"> Copy data from inputs. Only affects DataFrame / 2d ndarray input</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Constructing DataFrame from a dictionary.</span> |
| |
| <span class="sd"> >>> d = {'col1': [1, 2], 'col2': [3, 4]}</span> |
| <span class="sd"> >>> df = ps.DataFrame(data=d, columns=['col1', 'col2'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> col1 col2</span> |
| <span class="sd"> 0 1 3</span> |
| <span class="sd"> 1 2 4</span> |
| |
| <span class="sd"> Constructing DataFrame from pandas DataFrame</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame(pd.DataFrame(data=d, columns=['col1', 'col2']))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> col1 col2</span> |
| <span class="sd"> 0 1 3</span> |
| <span class="sd"> 1 2 4</span> |
| |
| <span class="sd"> Notice that the inferred dtype is int64.</span> |
| |
| <span class="sd"> >>> df.dtypes</span> |
| <span class="sd"> col1 int64</span> |
| <span class="sd"> col2 int64</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> To enforce a single dtype:</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame(data=d, dtype=np.int8)</span> |
| <span class="sd"> >>> df.dtypes</span> |
| <span class="sd"> col1 int8</span> |
| <span class="sd"> col2 int8</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> Constructing DataFrame from numpy ndarray:</span> |
| |
| <span class="sd"> >>> df2 = ps.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)),</span> |
| <span class="sd"> ... columns=['a', 'b', 'c', 'd', 'e'])</span> |
| <span class="sd"> >>> df2 # doctest: +SKIP</span> |
| <span class="sd"> a b c d e</span> |
| <span class="sd"> 0 3 1 4 9 8</span> |
| <span class="sd"> 1 4 8 4 8 4</span> |
| <span class="sd"> 2 7 6 5 6 7</span> |
| <span class="sd"> 3 8 7 9 1 0</span> |
| <span class="sd"> 4 2 5 4 3 9</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">InternalFrame</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="ow">not</span> <span class="n">copy</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">data</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">SparkDataFrame</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="ow">not</span> <span class="n">copy</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span><span class="n">spark_frame</span><span class="o">=</span><span class="n">data</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="ow">not</span> <span class="n">copy</span> |
| <span class="n">data</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">_internal</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">assert</span> <span class="ow">not</span> <span class="n">copy</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">data</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="n">data</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="n">copy</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_internal_frame"</span><span class="p">,</span> <span class="n">internal</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_pssers</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="n">Label</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]:</span> |
| <span class="sd">"""Return a dict of column label -> Series which anchors `self`."""</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_psseries"</span><span class="p">):</span> |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="s2">"_psseries"</span><span class="p">,</span> |
| <span class="p">{</span><span class="n">label</span><span class="p">:</span> <span class="n">Series</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">},</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psseries</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psseries</span> <span class="c1"># type: ignore</span> |
| <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">psseries</span><span class="p">),</span> <span class="p">(</span> |
| <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">),</span> |
| <span class="nb">len</span><span class="p">(</span><span class="n">psseries</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="bp">self</span> <span class="ow">is</span> <span class="ow">not</span> <span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">psseries</span><span class="o">.</span><span class="n">values</span><span class="p">()):</span> |
| <span class="c1"># Refresh the dict to contain only Series anchoring `self`.</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psseries</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="n">label</span><span class="p">:</span> <span class="n">psseries</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> |
| <span class="k">if</span> <span class="bp">self</span> <span class="ow">is</span> <span class="n">psseries</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="k">else</span> <span class="n">Series</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">}</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psseries</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_internal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">InternalFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal_frame</span> <span class="c1"># type: ignore</span> |
| |
| <span class="k">def</span> <span class="nf">_update_internal_frame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">internal</span><span class="p">:</span> <span class="n">InternalFrame</span><span class="p">,</span> <span class="n">requires_same_anchor</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Update InternalFrame with the given one.</span> |
| |
| <span class="sd"> If the column_label is changed or the new InternalFrame is not the same `anchor`,</span> |
| <span class="sd"> disconnect the link to the Series and create a new one.</span> |
| |
| <span class="sd"> If `requires_same_anchor` is `False`, the same-anchor check is skipped and the</span> |
| <span class="sd"> InternalFrame is forcibly updated, e.g., when replacing the internal with the resolved_copy</span> |
| <span class="sd"> or updating the underlying Spark DataFrame, which needs to combine a different Spark DataFrame.</span> |
| |
| <span class="sd"> :param internal: the new InternalFrame</span> |
| <span class="sd"> :param requires_same_anchor: whether to check for the same anchor</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_psseries"</span><span class="p">):</span> |
| <span class="n">psseries</span> <span class="o">=</span> <span class="p">{}</span> |
| |
| <span class="k">for</span> <span class="n">old_label</span><span class="p">,</span> <span class="n">new_label</span> <span class="ow">in</span> <span class="n">zip_longest</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">):</span> |
| <span class="k">if</span> <span class="n">old_label</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_pssers</span><span class="p">[</span><span class="n">old_label</span><span class="p">]</span> |
| |
| <span class="n">renamed</span> <span class="o">=</span> <span class="n">old_label</span> <span class="o">!=</span> <span class="n">new_label</span> |
| <span class="n">not_same_anchor</span> <span class="o">=</span> <span class="n">requires_same_anchor</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">internal</span><span class="p">,</span> <span class="n">psser</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">renamed</span> <span class="ow">or</span> <span class="n">not_same_anchor</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">select_column</span><span class="p">(</span><span class="n">old_label</span><span class="p">))</span> <span class="c1"># type: DataFrame</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_update_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">new_label</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">psser</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">Series</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">new_label</span><span class="p">)</span> |
| <span class="n">psseries</span><span class="p">[</span><span class="n">new_label</span><span class="p">]</span> <span class="o">=</span> <span class="n">psser</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psseries</span> <span class="o">=</span> <span class="n">psseries</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal_frame</span> <span class="o">=</span> <span class="n">internal</span> |
| |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_repr_pandas_cache"</span><span class="p">):</span> |
| <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_repr_pandas_cache</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">ndim</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return an int representing the number of array dimensions.</span> |
| |
| <span class="sd"> Returns 2 for DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],</span> |
| <span class="sd"> ... index=['cobra', 'viper', None],</span> |
| <span class="sd"> ... columns=['max_speed', 'shield'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> max_speed shield</span> |
| <span class="sd"> cobra 1 2</span> |
| <span class="sd"> viper 4 5</span> |
| <span class="sd"> NaN 7 8</span> |
| <span class="sd"> >>> df.ndim</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="mi">2</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">axes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a list representing the axes of the DataFrame.</span> |
| |
| <span class="sd"> It has the row axis labels and column axis labels as the only members.</span> |
| <span class="sd"> They are returned in that order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'col1': [1, 2], 'col2': [3, 4]})</span> |
| <span class="sd"> >>> df.axes</span> |
| <span class="sd"> [Int64Index([0, 1], dtype='int64'), Index(['col1', 'col2'], dtype='object')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span> |
| |
| <span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"Series"</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Applies sfun to each column and returns a pd.Series where the number of rows equal the</span> |
| <span class="sd"> number of columns.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sfun : either an 1-arg function that takes a Column and returns a Column, or</span> |
| <span class="sd"> a 2-arg function that takes a Column and its DataType and returns a Column.</span> |
| <span class="sd"> axis: used only for sanity check because series only support index axis.</span> |
| <span class="sd"> name : original pandas API name.</span> |
| <span class="sd"> axis : axis to apply. 0 or 1, or 'index' or 'columns'.</span> |
| <span class="sd"> numeric_only : bool, default True</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility. Only 'DataFrame.count' uses this parameter</span> |
| <span class="sd"> currently.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">min_count</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"min_count"</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> |
| |
| <span class="n">exprs</span> <span class="o">=</span> <span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">StringType</span><span class="p">())</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">)]</span> |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="n">is_numeric_or_boolean</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">keep_column</span> <span class="o">=</span> <span class="ow">not</span> <span class="n">numeric_only</span> <span class="ow">or</span> <span class="n">is_numeric_or_boolean</span> |
| |
| <span class="k">if</span> <span class="n">keep_column</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">sfun</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">min_count</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">Frame</span><span class="o">.</span><span class="n">_count_expr</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> <span class="o">>=</span> <span class="n">min_count</span><span class="p">,</span> <span class="n">scol</span><span class="p">)</span> |
| |
| <span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span> |
| <span class="n">new_column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">exprs</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">Series</span><span class="p">([])</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">exprs</span><span class="p">)</span> |
| |
| <span class="c1"># The data is expected to be small so it's fine to transpose/use default index.</span> |
| <span class="k">with</span> <span class="n">ps</span><span class="o">.</span><span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.max_rows"</span><span class="p">,</span> <span class="mi">1</span><span class="p">):</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">)],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">new_column_labels</span><span class="p">,</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">())</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Here we execute with the first 1000 to get the return type.</span> |
| <span class="c1"># If the records were less than 1000, it uses pandas API directly for a shortcut.</span> |
| <span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.shortcut_limit"</span><span class="p">)</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="n">pser</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">pdf</span><span class="p">,</span> <span class="n">name</span><span class="p">)(</span><span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">limit</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">Series</span><span class="p">(</span><span class="n">pser</span><span class="p">)</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="n">as_spark_type</span><span class="p">(</span><span class="n">pser</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">type</span><span class="p">))</span> <span class="c1"># type: ignore</span> |
| <span class="k">def</span> <span class="nf">calculate_columns_axis</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">(</span><span class="n">cols</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">),</span> <span class="n">name</span><span class="p">)(</span> |
| <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span> |
| <span class="p">)</span> |
| |
| <span class="n">column_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">),</span> |
| <span class="s2">"__calculate_columns_axis__"</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="o">+</span> <span class="p">[</span><span class="n">calculate_columns_axis</span><span class="p">(</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">column_name</span><span class="p">)]</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">pser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_psser_for</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">label</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Create Series with a proper column label.</span> |
| |
| <span class="sd"> The given label must be verified to exist in `InternalFrame.column_labels`.</span> |
| |
| <span class="sd"> For example, in some method, self is like:</span> |
| |
| <span class="sd"> >>> self = ps.range(3)</span> |
| |
| <span class="sd"> `self._psser_for(label)` can be used with `InternalFrame.column_labels`:</span> |
| |
| <span class="sd"> >>> self._psser_for(self._internal.column_labels[0])</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> Name: id, dtype: int64</span> |
| |
| <span class="sd"> `self._psser_for(label)` must not be used directly with user inputs.</span> |
| <span class="sd"> In that case, `self[label]` should be used instead, which checks the label exists or not:</span> |
| |
| <span class="sd"> >>> self['id']</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> Name: id, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_pssers</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> |
| |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"Series"</span><span class="p">],</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Column</span><span class="p">]],</span> <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="c1"># Arithmetic Operators</span> |
| <span class="k">def</span> <span class="nf">_map_series_op</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">op</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.base</span> <span class="kn">import</span> <span class="n">IndexOpsMixin</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_sequence</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"</span><span class="si">%s</span><span class="s2"> with a sequence is currently not supported; "</span> |
| <span class="s2">"however, got </span><span class="si">%s</span><span class="s2">."</span> <span class="o">%</span> <span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">!=</span> <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"cannot join with no overlapping index names"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="c1"># Different DataFrames</span> |
| <span class="k">def</span> <span class="nf">apply_op</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">this_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> |
| <span class="n">that_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]:</span> |
| <span class="k">for</span> <span class="n">this_label</span><span class="p">,</span> <span class="n">that_label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">this_column_labels</span><span class="p">,</span> <span class="n">that_column_labels</span><span class="p">):</span> |
| <span class="k">yield</span> <span class="p">(</span> |
| <span class="nb">getattr</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">this_label</span><span class="p">),</span> <span class="n">op</span><span class="p">)(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">that_label</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">this_label</span><span class="p">),</span> |
| <span class="n">this_label</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">align_diff_frames</span><span class="p">(</span><span class="n">apply_op</span><span class="p">,</span> <span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">,</span> <span class="n">fillna</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">"full"</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">op</span><span class="p">)(</span><span class="n">other</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">,</span> <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">op</span><span class="p">)(</span><span class="n">other</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="fm">__add__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"add"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__radd__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"radd"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__truediv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"truediv"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__rtruediv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"rtruediv"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"mul"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__rmul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"rmul"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__sub__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"sub"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__rsub__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"rsub"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__pow__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"pow"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__rpow__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"rpow"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__mod__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"mod"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__rmod__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"rmod"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__floordiv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"floordiv"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__rfloordiv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"rfloordiv"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__abs__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="nb">abs</span><span class="p">(</span><span class="n">psser</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="fm">__neg__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="o">-</span><span class="n">psser</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.add"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.add.html#pyspark.pandas.DataFrame.add">[docs]</a> <span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">+</span> <span class="n">other</span></div> |
| |
| <span class="c1"># create accessor for plot</span> |
| <span class="n">plot</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"plot"</span><span class="p">,</span> <span class="n">PandasOnSparkPlotAccessor</span><span class="p">)</span> |
| |
| <span class="c1"># create accessor for Spark related methods.</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"spark"</span><span class="p">,</span> <span class="n">SparkFrameMethods</span><span class="p">)</span> |
| |
| <span class="c1"># create accessor for pandas-on-Spark specific methods.</span> |
| <span class="n">pandas_on_spark</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"pandas_on_spark"</span><span class="p">,</span> <span class="n">PandasOnSparkFrameMethods</span><span class="p">)</span> |
| |
| <span class="c1"># keep the name "koalas" for backward compatibility.</span> |
| <span class="n">koalas</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"koalas"</span><span class="p">,</span> <span class="n">PandasOnSparkFrameMethods</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.hist"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.hist.html#pyspark.pandas.DataFrame.hist">[docs]</a> <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">hist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bins</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">plot</span><span class="o">.</span><span class="n">hist</span><span class="p">(</span><span class="n">bins</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">)</span></div> |
| |
| <span class="n">hist</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">PandasOnSparkPlotAccessor</span><span class="o">.</span><span class="n">hist</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrame.kde"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.kde.html#pyspark.pandas.DataFrame.kde">[docs]</a> <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">kde</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bw_method</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">ind</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">plot</span><span class="o">.</span><span class="n">kde</span><span class="p">(</span><span class="n">bw_method</span><span class="p">,</span> <span class="n">ind</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">)</span></div> |
| |
| <span class="n">kde</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">PandasOnSparkPlotAccessor</span><span class="o">.</span><span class="n">kde</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="n">add</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Addition"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"+"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"dataframe + other"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"radd"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.radd"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.radd.html#pyspark.pandas.DataFrame.radd">[docs]</a> <span class="k">def</span> <span class="nf">radd</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">+</span> <span class="bp">self</span></div> |
| |
| <span class="n">radd</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Addition"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"+"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"other + dataframe"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"add"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.div"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.div.html#pyspark.pandas.DataFrame.div">[docs]</a> <span class="k">def</span> <span class="nf">div</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">/</span> <span class="n">other</span></div> |
| |
| <span class="n">div</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Floating division"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"/"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"dataframe / other"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"rdiv"</span> |
| <span class="p">)</span> |
| |
| <span class="n">divide</span> <span class="o">=</span> <span class="n">div</span> |
| |
| <div class="viewcode-block" id="DataFrame.rdiv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rdiv.html#pyspark.pandas.DataFrame.rdiv">[docs]</a> <span class="k">def</span> <span class="nf">rdiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">/</span> <span class="bp">self</span></div> |
| |
| <span class="n">rdiv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Floating division"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"/"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"other / dataframe"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"div"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.truediv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.truediv.html#pyspark.pandas.DataFrame.truediv">[docs]</a> <span class="k">def</span> <span class="nf">truediv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">/</span> <span class="n">other</span></div> |
| |
| <span class="n">truediv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Floating division"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"/"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"dataframe / other"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"rtruediv"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.rtruediv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rtruediv.html#pyspark.pandas.DataFrame.rtruediv">[docs]</a> <span class="k">def</span> <span class="nf">rtruediv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">/</span> <span class="bp">self</span></div> |
| |
| <span class="n">rtruediv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Floating division"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"/"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"other / dataframe"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"truediv"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.mul"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.mul.html#pyspark.pandas.DataFrame.mul">[docs]</a> <span class="k">def</span> <span class="nf">mul</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">*</span> <span class="n">other</span></div> |
| |
| <span class="n">mul</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Multiplication"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"*"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"dataframe * other"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"rmul"</span> |
| <span class="p">)</span> |
| |
| <span class="n">multiply</span> <span class="o">=</span> <span class="n">mul</span> |
| |
| <div class="viewcode-block" id="DataFrame.rmul"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rmul.html#pyspark.pandas.DataFrame.rmul">[docs]</a> <span class="k">def</span> <span class="nf">rmul</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">*</span> <span class="bp">self</span></div> |
| |
| <span class="n">rmul</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Multiplication"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"*"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"other * dataframe"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"mul"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.sub"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.sub.html#pyspark.pandas.DataFrame.sub">[docs]</a> <span class="k">def</span> <span class="nf">sub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">-</span> <span class="n">other</span></div> |
| |
| <span class="n">sub</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Subtraction"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"-"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"dataframe - other"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"rsub"</span> |
| <span class="p">)</span> |
| |
| <span class="n">subtract</span> <span class="o">=</span> <span class="n">sub</span> |
| |
| <div class="viewcode-block" id="DataFrame.rsub"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rsub.html#pyspark.pandas.DataFrame.rsub">[docs]</a> <span class="k">def</span> <span class="nf">rsub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">-</span> <span class="bp">self</span></div> |
| |
| <span class="n">rsub</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Subtraction"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"-"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"other - dataframe"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"sub"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.mod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.mod.html#pyspark.pandas.DataFrame.mod">[docs]</a> <span class="k">def</span> <span class="nf">mod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">%</span> <span class="n">other</span></div> |
| |
| <span class="n">mod</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Modulo"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"%"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"dataframe % other"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"rmod"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.rmod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rmod.html#pyspark.pandas.DataFrame.rmod">[docs]</a> <span class="k">def</span> <span class="nf">rmod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">%</span> <span class="bp">self</span></div> |
| |
| <span class="n">rmod</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Modulo"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"%"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"other % dataframe"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"mod"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.pow"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.pow.html#pyspark.pandas.DataFrame.pow">[docs]</a> <span class="k">def</span> <span class="nf">pow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">**</span> <span class="n">other</span></div> |
| |
| <span class="n">pow</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Exponential power of series"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"**"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"dataframe ** other"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"rpow"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.rpow"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rpow.html#pyspark.pandas.DataFrame.rpow">[docs]</a> <span class="k">def</span> <span class="nf">rpow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">**</span> <span class="bp">self</span></div> |
| |
| <span class="n">rpow</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Exponential power"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"**"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"other ** dataframe"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"pow"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.floordiv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.floordiv.html#pyspark.pandas.DataFrame.floordiv">[docs]</a> <span class="k">def</span> <span class="nf">floordiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">//</span> <span class="n">other</span></div> |
| |
| <span class="n">floordiv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Integer division"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"//"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"dataframe // other"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"rfloordiv"</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.rfloordiv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rfloordiv.html#pyspark.pandas.DataFrame.rfloordiv">[docs]</a> <span class="k">def</span> <span class="nf">rfloordiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">other</span> <span class="o">//</span> <span class="bp">self</span></div> |
| |
| <span class="n">rfloordiv</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">_flex_doc_FRAME</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">desc</span><span class="o">=</span><span class="s2">"Integer division"</span><span class="p">,</span> <span class="n">op_name</span><span class="o">=</span><span class="s2">"//"</span><span class="p">,</span> <span class="n">equiv</span><span class="o">=</span><span class="s2">"other // dataframe"</span><span class="p">,</span> <span class="n">reverse</span><span class="o">=</span><span class="s2">"floordiv"</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Comparison Operators</span> |
| <span class="k">def</span> <span class="fm">__eq__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> <span class="c1"># type: ignore[override]</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"eq"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__ne__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> <span class="c1"># type: ignore[override]</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"ne"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__lt__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"lt"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__le__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"le"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__ge__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"ge"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__gt__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_map_series_op</span><span class="p">(</span><span class="s2">"gt"</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.eq"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.eq.html#pyspark.pandas.DataFrame.eq">[docs]</a> <span class="k">def</span> <span class="nf">eq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is equal to the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.eq(1)</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> a True True</span> |
| <span class="sd"> b False False</span> |
| <span class="sd"> c False True</span> |
| <span class="sd"> d False False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">==</span> <span class="n">other</span></div> |
| |
| <span class="n">equals</span> <span class="o">=</span> <span class="n">eq</span> |
| |
| <div class="viewcode-block" id="DataFrame.gt"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.gt.html#pyspark.pandas.DataFrame.gt">[docs]</a> <span class="k">def</span> <span class="nf">gt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is greater than the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.gt(2)</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> a False False</span> |
| <span class="sd"> b False False</span> |
| <span class="sd"> c True False</span> |
| <span class="sd"> d True False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">></span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.ge"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.ge.html#pyspark.pandas.DataFrame.ge">[docs]</a> <span class="k">def</span> <span class="nf">ge</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is greater than or equal to the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.ge(1)</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> a True True</span> |
| <span class="sd"> b True False</span> |
| <span class="sd"> c True True</span> |
| <span class="sd"> d True False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">>=</span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.lt"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.lt.html#pyspark.pandas.DataFrame.lt">[docs]</a> <span class="k">def</span> <span class="nf">lt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is less than the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.lt(1)</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> a False False</span> |
| <span class="sd"> b False False</span> |
| <span class="sd"> c False False</span> |
| <span class="sd"> d False False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">&lt;</span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.le"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.le.html#pyspark.pandas.DataFrame.le">[docs]</a> <span class="k">def</span> <span class="nf">le</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is less than or equal to the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.le(2)</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> a True True</span> |
| <span class="sd"> b True False</span> |
| <span class="sd"> c False True</span> |
| <span class="sd"> d False False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">&lt;=</span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.ne"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.ne.html#pyspark.pandas.DataFrame.ne">[docs]</a> <span class="k">def</span> <span class="nf">ne</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compare if the current value is not equal to the other.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'b': [1, np.nan, 1, np.nan]},</span> |
| <span class="sd"> ... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.ne(1)</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> a False False</span> |
| <span class="sd"> b True True</span> |
| <span class="sd"> c True False</span> |
| <span class="sd"> d True True</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> <span class="o">!=</span> <span class="n">other</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.applymap"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.applymap.html#pyspark.pandas.DataFrame.applymap">[docs]</a> <span class="k">def</span> <span class="nf">applymap</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Apply a function to a DataFrame elementwise.</span> |
| |
| <span class="sd"> This method applies a function that accepts and returns a scalar</span> |
| <span class="sd"> to every element of a DataFrame.</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span> |
| |
| <span class="sd"> >>> def square(x) -> np.int32:</span> |
| <span class="sd"> ... return x ** 2</span> |
| |
| <span class="sd"> pandas-on-Spark uses return type hint and does not try to infer the type.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : callable</span> |
| <span class="sd"> Python function, returns a single value from a single value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Transformed DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[1, 2.12], [3.356, 4.567]])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 0 1.000 2.120</span> |
| <span class="sd"> 1 3.356 4.567</span> |
| |
| <span class="sd"> >>> def str_len(x) -> int:</span> |
| <span class="sd"> ... return len(str(x))</span> |
| <span class="sd"> >>> df.applymap(str_len)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 0 3 4</span> |
| <span class="sd"> 1 5 5</span> |
| |
| <span class="sd"> >>> def power(x) -> float:</span> |
| <span class="sd"> ... return x ** 2</span> |
| <span class="sd"> >>> df.applymap(power)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 0 1.000000 4.494400</span> |
| <span class="sd"> 1 11.262736 20.857489</span> |
| |
| <span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span> |
| |
| <span class="sd"> >>> df.applymap(lambda x: x ** 2)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 0 1.000000 4.494400</span> |
| <span class="sd"> 1 11.262736 20.857489</span> |
| <span class="sd"> """</span> |
| |
| <span class="c1"># TODO: In theory we can implement a shortcut here, since this creates a new DataFrame</span> |
| <span class="c1"># anyway and we don't have to worry about operations on different DataFrames.</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">func</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: not all of pandas' arguments are implemented yet.</span> |
| <div class="viewcode-block" id="DataFrame.aggregate"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.aggregate.html#pyspark.pandas.DataFrame.aggregate">[docs]</a> <span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""Aggregate using one or more operations over the specified axis.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : dict or a list</span> |
| <span class="sd"> a dict mapping from column name (string) to</span> |
| <span class="sd"> aggregate functions (list of strings).</span> |
| <span class="sd"> If a list is given, the aggregation is performed against</span> |
| <span class="sd"> all columns.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> `agg` is an alias for `aggregate`. Use the alias.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.apply : Invoke function on DataFrame.</span> |
| <span class="sd"> DataFrame.transform : Only perform transforming type operations.</span> |
| <span class="sd"> DataFrame.groupby : Perform operations over groups.</span> |
| <span class="sd"> Series.aggregate : The equivalent function for Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[1, 2, 3],</span> |
| <span class="sd"> ... [4, 5, 6],</span> |
| <span class="sd"> ... [7, 8, 9],</span> |
| <span class="sd"> ... [np.nan, np.nan, np.nan]],</span> |
| <span class="sd"> ... columns=['A', 'B', 'C'])</span> |
| |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1.0 2.0 3.0</span> |
| <span class="sd"> 1 4.0 5.0 6.0</span> |
| <span class="sd"> 2 7.0 8.0 9.0</span> |
| <span class="sd"> 3 NaN NaN NaN</span> |
| |
| <span class="sd"> Aggregate these functions over the rows.</span> |
| |
| <span class="sd"> >>> df.agg(['sum', 'min'])[['A', 'B', 'C']].sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> min 1.0 2.0 3.0</span> |
| <span class="sd"> sum 12.0 15.0 18.0</span> |
| |
| <span class="sd"> Different aggregations per column.</span> |
| |
| <span class="sd"> >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})[['A', 'B']].sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> max NaN 8.0</span> |
| <span class="sd"> min 1.0 2.0</span> |
| <span class="sd"> sum 12.0 NaN</span> |
| |
| <span class="sd"> For multi-index columns:</span> |
| |
| <span class="sd"> >>> df.columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")])</span> |
| <span class="sd"> >>> df.agg(['sum', 'min'])[[("X", "A"), ("X", "B"), ("Y", "C")]].sort_index()</span> |
| <span class="sd"> X Y</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> min 1.0 2.0 3.0</span> |
| <span class="sd"> sum 12.0 15.0 18.0</span> |
| |
| <span class="sd"> >>> aggregated = df.agg({("X", "A") : ['sum', 'min'], ("X", "B") : ['min', 'max']})</span> |
| <span class="sd"> >>> aggregated[[("X", "A"), ("X", "B")]].sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> X</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> max NaN 8.0</span> |
| <span class="sd"> min 1.0 2.0</span> |
| <span class="sd"> sum 12.0 NaN</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">GroupBy</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">all</span><span class="p">((</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">func</span><span class="p">)):</span> |
| <span class="n">func</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">([(</span><span class="n">column</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span> <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">])</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"If the given function is a list, it "</span> |
| <span class="s2">"should only contains function names as strings."</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span> |
| <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| <span class="ow">or</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">value</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"aggs must be a dict mapping from column name to aggregate "</span> |
| <span class="s2">"functions (string or list of strings)."</span> |
| <span class="p">)</span> |
| |
| <span class="k">with</span> <span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.default_index_type"</span><span class="p">,</span> <span class="s2">"distributed"</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_groupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">))</span> <span class="c1"># type: DataFrame</span> |
| |
| <span class="c1"># The code below basically converts:</span> |
| <span class="c1">#</span> |
| <span class="c1"># A B</span> |
| <span class="c1"># sum min min max</span> |
| <span class="c1"># 0 12.0 1.0 2.0 8.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># to:</span> |
| <span class="c1"># A B</span> |
| <span class="c1"># max NaN 8.0</span> |
| <span class="c1"># min 1.0 2.0</span> |
| <span class="c1"># sum 12.0 NaN</span> |
| <span class="c1">#</span> |
| <span class="c1"># Aggregated output is usually quite small.</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span><span class="o">.</span><span class="n">stack</span><span class="p">()</span><span class="o">.</span><span class="n">droplevel</span><span class="p">(</span><span class="mi">0</span><span class="p">)[</span><span class="nb">list</span><span class="p">(</span><span class="n">func</span><span class="o">.</span><span class="n">keys</span><span class="p">())]</span></div> |
| |
| <span class="n">agg</span> <span class="o">=</span> <span class="n">aggregate</span> |
| |
| <div class="viewcode-block" id="DataFrame.corr"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.corr.html#pyspark.pandas.DataFrame.corr">[docs]</a> <span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"pearson"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute pairwise correlation of columns, excluding NA/null values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> method : {'pearson', 'spearman'}</span> |
| <span class="sd"> * pearson : standard correlation coefficient</span> |
| <span class="sd"> * spearman : Spearman rank correlation</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> y : DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.corr</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],</span> |
| <span class="sd"> ... columns=['dogs', 'cats'])</span> |
| <span class="sd"> >>> df.corr('pearson')</span> |
| <span class="sd"> dogs cats</span> |
| <span class="sd"> dogs 1.000000 -0.851064</span> |
| <span class="sd"> cats -0.851064 1.000000</span> |
| |
| <span class="sd"> >>> df.corr('spearman')</span> |
| <span class="sd"> dogs cats</span> |
| <span class="sd"> dogs 1.000000 -0.948683</span> |
| <span class="sd"> cats -0.948683 1.000000</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> There are behavior differences between pandas-on-Spark and pandas.</span> |
| |
| <span class="sd"> * the `method` argument only accepts 'pearson', 'spearman'</span> |
| <span class="sd"> * the data should not contain NaNs. pandas-on-Spark will return an error.</span> |
| <span class="sd"> * pandas-on-Spark doesn't support the following argument(s).</span> |
| |
| <span class="sd"> * `min_periods` argument is not supported</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">)))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.iteritems"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.iteritems.html#pyspark.pandas.DataFrame.iteritems">[docs]</a> <span class="k">def</span> <span class="nf">iteritems</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Iterator over (column name, Series) pairs.</span> |
| |
| <span class="sd"> Iterates over the DataFrame columns, returning a tuple with</span> |
| <span class="sd"> the column name and the content as a Series.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> label : object</span> |
| <span class="sd"> The column names for the DataFrame being iterated over.</span> |
| <span class="sd"> content : Series</span> |
| <span class="sd"> The column entries belonging to each label, as a Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'species': ['bear', 'bear', 'marsupial'],</span> |
| <span class="sd"> ... 'population': [1864, 22000, 80000]},</span> |
| <span class="sd"> ... index=['panda', 'polar', 'koala'],</span> |
| <span class="sd"> ... columns=['species', 'population'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> species population</span> |
| <span class="sd"> panda bear 1864</span> |
| <span class="sd"> polar bear 22000</span> |
| <span class="sd"> koala marsupial 80000</span> |
| |
| <span class="sd"> >>> for label, content in df.iteritems():</span> |
| <span class="sd"> ... print('label:', label)</span> |
| <span class="sd"> ... print('content:', content.to_string())</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> label: species</span> |
| <span class="sd"> content: panda bear</span> |
| <span class="sd"> polar bear</span> |
| <span class="sd"> koala marsupial</span> |
| <span class="sd"> label: population</span> |
| <span class="sd"> content: panda 1864</span> |
| <span class="sd"> polar 22000</span> |
| <span class="sd"> koala 80000</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">(</span> |
| <span class="p">(</span><span class="n">label</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.iterrows"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.iterrows.html#pyspark.pandas.DataFrame.iterrows">[docs]</a> <span class="k">def</span> <span class="nf">iterrows</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Iterate over DataFrame rows as (index, Series) pairs.</span> |
| |
| <span class="sd"> Yields</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> index : label or tuple of label</span> |
| <span class="sd"> The index of the row. A tuple for a `MultiIndex`.</span> |
| <span class="sd"> data : pandas.Series</span> |
| <span class="sd"> The data of the row as a Series.</span> |
| |
| <span class="sd"> it : generator</span> |
| <span class="sd"> A generator that iterates over the rows of the frame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| |
| <span class="sd"> 1. Because ``iterrows`` returns a Series for each row,</span> |
| <span class="sd"> it does **not** preserve dtypes across the rows (dtypes are</span> |
| <span class="sd"> preserved across columns for DataFrames). For example,</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([[1, 1.5]], columns=['int', 'float'])</span> |
| <span class="sd"> >>> row = next(df.iterrows())[1]</span> |
| <span class="sd"> >>> row</span> |
| <span class="sd"> int 1.0</span> |
| <span class="sd"> float 1.5</span> |
| <span class="sd"> Name: 0, dtype: float64</span> |
| <span class="sd"> >>> print(row['int'].dtype)</span> |
| <span class="sd"> float64</span> |
| <span class="sd"> >>> print(df['int'].dtype)</span> |
| <span class="sd"> int64</span> |
| |
| <span class="sd"> To preserve dtypes while iterating over the rows, it is better</span> |
| <span class="sd"> to use :meth:`itertuples` which returns namedtuples of the values</span> |
| <span class="sd"> and which is generally faster than ``iterrows``.</span> |
| |
| <span class="sd"> 2. You should **never modify** something you are iterating over.</span> |
| <span class="sd"> This is not guaranteed to work in all cases. Depending on the</span> |
| <span class="sd"> data types, the iterator returns a copy and not a view, and writing</span> |
| <span class="sd"> to it will have no effect.</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span> |
| <span class="n">internal_index_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">internal_data_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| |
| <span class="k">def</span> <span class="nf">extract_kv_from_spark_row</span><span class="p">(</span><span class="n">row</span><span class="p">:</span> <span class="n">Row</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">]:</span> |
| <span class="n">k</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">row</span><span class="p">[</span><span class="n">internal_index_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">internal_index_columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="k">else</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">row</span><span class="p">[</span><span class="n">c</span><span class="p">]</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">internal_index_columns</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">v</span> <span class="o">=</span> <span class="p">[</span><span class="n">row</span><span class="p">[</span><span class="n">c</span><span class="p">]</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">internal_data_columns</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> |
| |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">map</span><span class="p">(</span> |
| <span class="n">extract_kv_from_spark_row</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">toLocalIterator</span><span class="p">()</span> |
| <span class="p">):</span> |
| <span class="n">s</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="n">k</span><span class="p">)</span> |
| <span class="k">yield</span> <span class="n">k</span><span class="p">,</span> <span class="n">s</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.itertuples"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.itertuples.html#pyspark.pandas.DataFrame.itertuples">[docs]</a> <span class="k">def</span> <span class="nf">itertuples</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"PandasOnSpark"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Iterate over DataFrame rows as namedtuples.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> index : bool, default True</span> |
| <span class="sd"> If True, return the index as the first element of the tuple.</span> |
| <span class="sd"> name : str or None, default "PandasOnSpark"</span> |
| <span class="sd"> The name of the returned namedtuples or None to return regular</span> |
| <span class="sd"> tuples.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> iterator</span> |
| <span class="sd"> An object to iterate over namedtuples for each row in the</span> |
| <span class="sd"> DataFrame with the first field possibly being the index and</span> |
| <span class="sd"> following fields being the column values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)</span> |
| <span class="sd"> pairs.</span> |
| <span class="sd"> DataFrame.items : Iterate over (column name, Series) pairs.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The column names will be renamed to positional names if they are</span> |
| <span class="sd"> invalid Python identifiers, repeated, or start with an underscore.</span> |
| <span class="sd"> On python versions &lt; 3.7 regular tuples are returned for DataFrames</span> |
| <span class="sd"> with a large number of columns (&gt;254).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},</span> |
| <span class="sd"> ... index=['dog', 'hawk'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> num_legs num_wings</span> |
| <span class="sd"> dog 4 0</span> |
| <span class="sd"> hawk 2 2</span> |
| |
| <span class="sd"> >>> for row in df.itertuples():</span> |
| <span class="sd"> ... print(row)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> PandasOnSpark(Index='dog', num_legs=4, num_wings=0)</span> |
| <span class="sd"> PandasOnSpark(Index='hawk', num_legs=2, num_wings=2)</span> |
| |
| <span class="sd"> By setting the `index` parameter to False we can remove the index</span> |
| <span class="sd"> as the first element of the tuple:</span> |
| |
| <span class="sd"> >>> for row in df.itertuples(index=False):</span> |
| <span class="sd"> ... print(row)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> PandasOnSpark(num_legs=4, num_wings=0)</span> |
| <span class="sd"> PandasOnSpark(num_legs=2, num_wings=2)</span> |
| |
| <span class="sd"> With the `name` parameter set we set a custom name for the yielded</span> |
| <span class="sd"> namedtuples:</span> |
| |
| <span class="sd"> >>> for row in df.itertuples(name='Animal'):</span> |
| <span class="sd"> ... print(row)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> Animal(Index='dog', num_legs=4, num_wings=0)</span> |
| <span class="sd"> Animal(Index='hawk', num_legs=2, num_wings=2)</span> |
| <span class="sd"> """</span> |
| <span class="n">fields</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">index</span><span class="p">:</span> |
| <span class="n">fields</span><span class="o">.</span><span class="n">insert</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s2">"Index"</span><span class="p">)</span> |
| |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">data_spark_column_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| |
| <span class="k">def</span> <span class="nf">extract_kv_from_spark_row</span><span class="p">(</span><span class="n">row</span><span class="p">:</span> <span class="n">Row</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">]:</span> |
| <span class="n">k</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">row</span><span class="p">[</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_spark_column_names</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="k">else</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">row</span><span class="p">[</span><span class="n">c</span><span class="p">]</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">v</span> <span class="o">=</span> <span class="p">[</span><span class="n">row</span><span class="p">[</span><span class="n">c</span><span class="p">]</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">data_spark_column_names</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> |
| |
| <span class="n">can_return_named_tuples</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">version_info</span> <span class="o">&gt;=</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">7</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="n">index</span> <span class="o">&lt;</span> <span class="mi">255</span> |
| |
| <span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">can_return_named_tuples</span><span class="p">:</span> |
| <span class="n">itertuple</span> <span class="o">=</span> <span class="n">namedtuple</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">fields</span><span class="p">,</span> <span class="n">rename</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="c1"># type: ignore</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">map</span><span class="p">(</span> |
| <span class="n">extract_kv_from_spark_row</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">toLocalIterator</span><span class="p">(),</span> |
| <span class="p">):</span> |
| <span class="k">yield</span> <span class="n">itertuple</span><span class="o">.</span><span class="n">_make</span><span class="p">(([</span><span class="n">k</span><span class="p">]</span> <span class="k">if</span> <span class="n">index</span> <span class="k">else</span> <span class="p">[])</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">v</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">map</span><span class="p">(</span> |
| <span class="n">extract_kv_from_spark_row</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">toLocalIterator</span><span class="p">(),</span> |
| <span class="p">):</span> |
| <span class="k">yield</span> <span class="nb">tuple</span><span class="p">(([</span><span class="n">k</span><span class="p">]</span> <span class="k">if</span> <span class="n">index</span> <span class="k">else</span> <span class="p">[])</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">v</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.items"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.items.html#pyspark.pandas.DataFrame.items">[docs]</a> <span class="k">def</span> <span class="nf">items</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]]:</span> |
| <span class="sd">"""This is an alias of ``iteritems``."""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_clipboard"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_clipboard.html#pyspark.pandas.DataFrame.to_clipboard">[docs]</a> <span class="k">def</span> <span class="nf">to_clipboard</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">excel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">sep</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Copy object to the system clipboard.</span> |
| |
| <span class="sd"> Write a text representation of object to the system clipboard.</span> |
| <span class="sd"> This can be pasted into Excel, for example.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting DataFrame is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> excel : bool, default True</span> |
| <span class="sd"> - True, use the provided separator, writing in a csv format for</span> |
| <span class="sd"> allowing easy pasting into excel.</span> |
| <span class="sd"> - False, write a string representation of the object to the</span> |
| <span class="sd"> clipboard.</span> |
| |
| <span class="sd"> sep : str, default ``'\\t'``</span> |
| <span class="sd"> Field delimiter.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> These parameters will be passed to DataFrame.to_csv.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Requirements for your platform.</span> |
| |
| <span class="sd"> - Linux : `xclip`, or `xsel` (with `gtk` or `PyQt4` modules)</span> |
| <span class="sd"> - Windows : none</span> |
| <span class="sd"> - OS X : none</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_clipboard : Read text from clipboard.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Copy the contents of a DataFrame to the clipboard.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) # doctest: +SKIP</span> |
| <span class="sd"> >>> df.to_clipboard(sep=',') # doctest: +SKIP</span> |
| <span class="sd"> ... # Wrote the following to the system clipboard:</span> |
| <span class="sd"> ... # ,A,B,C</span> |
| <span class="sd"> ... # 0,1,2,3</span> |
| <span class="sd"> ... # 1,4,5,6</span> |
| |
| <span class="sd"> We can omit the index by passing the keyword `index` and setting</span> |
| <span class="sd"> it to false.</span> |
| |
| <span class="sd"> >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP</span> |
| <span class="sd"> ... # Wrote the following to the system clipboard:</span> |
| <span class="sd"> ... # A,B,C</span> |
| <span class="sd"> ... # 1,2,3</span> |
| <span class="sd"> ... # 4,5,6</span> |
| |
| <span class="sd"> This function also works for Series:</span> |
| |
| <span class="sd"> >>> df = ps.Series([1, 2, 3, 4, 5, 6, 7], name='x') # doctest: +SKIP</span> |
| <span class="sd"> >>> df.to_clipboard(sep=',') # doctest: +SKIP</span> |
| <span class="sd"> ... # Wrote the following to the system clipboard:</span> |
| <span class="sd"> ... # 0, 1</span> |
| <span class="sd"> ... # 1, 2</span> |
| <span class="sd"> ... # 2, 3</span> |
| <span class="sd"> ... # 3, 4</span> |
| <span class="sd"> ... # 4, 5</span> |
| <span class="sd"> ... # 5, 6</span> |
| <span class="sd"> ... # 6, 7</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_clipboard</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_clipboard</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_html"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_html.html#pyspark.pandas.DataFrame.to_html">[docs]</a> <span class="k">def</span> <span class="nf">to_html</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Sequence</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">col_space</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"NaN"</span><span class="p">,</span> |
| <span class="n">formatters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span> |
| <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">str</span><span class="p">]],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]]</span> |
| <span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">float_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="nb">float</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">sparsify</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">justify</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">max_rows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">max_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">show_dimensions</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">decimal</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"."</span><span class="p">,</span> |
| <span class="n">bold_rows</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">classes</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">escape</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">notebook</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">border</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">table_id</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">render_links</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Render a DataFrame as an HTML table.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas object is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory. If the input</span> |
| <span class="sd"> is large, set max_rows parameter.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> buf : StringIO-like, optional</span> |
| <span class="sd"> Buffer to write to.</span> |
| <span class="sd"> columns : sequence, optional, default None</span> |
| <span class="sd"> The subset of columns to write. Writes all columns by default.</span> |
| <span class="sd"> col_space : int, optional</span> |
| <span class="sd"> The minimum width of each column.</span> |
| <span class="sd"> header : bool, optional</span> |
| <span class="sd"> Write out the column names. If a list of strings is given, it</span> |
| <span class="sd"> is assumed to be aliases for the column names</span> |
| <span class="sd"> index : bool, optional, default True</span> |
| <span class="sd"> Whether to print index (row) labels.</span> |
| <span class="sd"> na_rep : str, optional, default 'NaN'</span> |
| <span class="sd"> String representation of NAN to use.</span> |
| <span class="sd"> formatters : list or dict of one-param. functions, optional</span> |
| <span class="sd"> Formatter functions to apply to columns' elements by position or</span> |
| <span class="sd"> name.</span> |
| <span class="sd"> The result of each function must be a unicode string.</span> |
| <span class="sd"> List must be of length equal to the number of columns.</span> |
| <span class="sd"> float_format : one-parameter function, optional, default None</span> |
| <span class="sd"> Formatter function to apply to columns' elements if they are</span> |
| <span class="sd"> floats. The result of this function must be a unicode string.</span> |
| <span class="sd"> sparsify : bool, optional, default True</span> |
| <span class="sd"> Set to False for a DataFrame with a hierarchical index to print</span> |
| <span class="sd"> every multiindex key at each row.</span> |
| <span class="sd"> index_names : bool, optional, default True</span> |
| <span class="sd"> Prints the names of the indexes.</span> |
| <span class="sd"> justify : str, default None</span> |
| <span class="sd"> How to justify the column labels. If None uses the option from</span> |
| <span class="sd"> the print configuration (controlled by set_option), 'right' out</span> |
| <span class="sd"> of the box. Valid values are</span> |
| |
| <span class="sd"> * left</span> |
| <span class="sd"> * right</span> |
| <span class="sd"> * center</span> |
| <span class="sd"> * justify</span> |
| <span class="sd"> * justify-all</span> |
| <span class="sd"> * start</span> |
| <span class="sd"> * end</span> |
| <span class="sd"> * inherit</span> |
| <span class="sd"> * match-parent</span> |
| <span class="sd"> * initial</span> |
| <span class="sd"> * unset.</span> |
| <span class="sd"> max_rows : int, optional</span> |
| <span class="sd"> Maximum number of rows to display in the console.</span> |
| <span class="sd"> max_cols : int, optional</span> |
| <span class="sd"> Maximum number of columns to display in the console.</span> |
| <span class="sd"> show_dimensions : bool, default False</span> |
| <span class="sd"> Display DataFrame dimensions (number of rows by number of columns).</span> |
| <span class="sd"> decimal : str, default '.'</span> |
| <span class="sd"> Character recognized as decimal separator, e.g. ',' in Europe.</span> |
| <span class="sd"> bold_rows : bool, default True</span> |
| <span class="sd"> Make the row labels bold in the output.</span> |
| <span class="sd"> classes : str or list or tuple, default None</span> |
| <span class="sd"> CSS class(es) to apply to the resulting html table.</span> |
| <span class="sd"> escape : bool, default True</span> |
| <span class="sd"> Convert the characters &lt;, &gt;, and &amp; to HTML-safe sequences.</span> |
| <span class="sd"> notebook : {True, False}, default False</span> |
| <span class="sd"> Whether the generated HTML is for IPython Notebook.</span> |
| <span class="sd"> border : int</span> |
| <span class="sd"> A ``border=border`` attribute is included in the opening</span> |
| <span class="sd"> `&lt;table&gt;` tag. Default ``pd.options.html.border``.</span> |
| <span class="sd"> table_id : str, optional</span> |
| <span class="sd"> A css id is included in the opening `&lt;table&gt;` tag if specified.</span> |
| <span class="sd"> render_links : bool, default False</span> |
| <span class="sd"> Convert URLs to HTML links (only works with pandas 0.24+).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> str (or unicode, depending on data and options)</span> |
| <span class="sd"> String representation of the dataframe.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> to_string : Convert DataFrame to a string.</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Make sure locals() call is at the top of the function so we don't capture local variables.</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">max_rows</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">max_rows</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_html</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_html</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_string"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_string.html#pyspark.pandas.DataFrame.to_string">[docs]</a> <span class="k">def</span> <span class="nf">to_string</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Sequence</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">col_space</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"NaN"</span><span class="p">,</span> |
| <span class="n">formatters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span> |
| <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">str</span><span class="p">]],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]]</span> |
| <span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">float_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="nb">float</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">sparsify</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">justify</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">max_rows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">max_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">show_dimensions</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">decimal</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"."</span><span class="p">,</span> |
| <span class="n">line_width</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Render a DataFrame to a console-friendly tabular output.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas object is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory. If the input</span> |
| <span class="sd"> is large, set the max_rows parameter.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> buf : StringIO-like, optional</span> |
| <span class="sd"> Buffer to write to.</span> |
| <span class="sd"> columns : sequence, optional, default None</span> |
| <span class="sd"> The subset of columns to write. Writes all columns by default.</span> |
| <span class="sd"> col_space : int, optional</span> |
| <span class="sd"> The minimum width of each column.</span> |
| <span class="sd"> header : bool, optional</span> |
| <span class="sd"> Write out the column names. If a list of strings is given, it</span> |
| <span class="sd"> is assumed to be aliases for the column names</span> |
| <span class="sd"> index : bool, optional, default True</span> |
| <span class="sd"> Whether to print index (row) labels.</span> |
| <span class="sd"> na_rep : str, optional, default 'NaN'</span> |
| <span class="sd"> String representation of NAN to use.</span> |
| <span class="sd"> formatters : list or dict of one-param. functions, optional</span> |
| <span class="sd"> Formatter functions to apply to columns' elements by position or</span> |
| <span class="sd"> name.</span> |
| <span class="sd"> The result of each function must be a unicode string.</span> |
| <span class="sd"> List must be of length equal to the number of columns.</span> |
| <span class="sd"> float_format : one-parameter function, optional, default None</span> |
| <span class="sd"> Formatter function to apply to columns' elements if they are</span> |
| <span class="sd"> floats. The result of this function must be a unicode string.</span> |
| <span class="sd"> sparsify : bool, optional, default True</span> |
| <span class="sd"> Set to False for a DataFrame with a hierarchical index to print</span> |
| <span class="sd"> every multiindex key at each row.</span> |
| <span class="sd"> index_names : bool, optional, default True</span> |
| <span class="sd"> Prints the names of the indexes.</span> |
| <span class="sd"> justify : str, default None</span> |
| <span class="sd"> How to justify the column labels. If None uses the option from</span> |
| <span class="sd"> the print configuration (controlled by set_option), 'right' out</span> |
| <span class="sd"> of the box. Valid values are</span> |
| |
| <span class="sd"> * left</span> |
| <span class="sd"> * right</span> |
| <span class="sd"> * center</span> |
| <span class="sd"> * justify</span> |
| <span class="sd"> * justify-all</span> |
| <span class="sd"> * start</span> |
| <span class="sd"> * end</span> |
| <span class="sd"> * inherit</span> |
| <span class="sd"> * match-parent</span> |
| <span class="sd"> * initial</span> |
| <span class="sd"> * unset.</span> |
| <span class="sd"> max_rows : int, optional</span> |
| <span class="sd"> Maximum number of rows to display in the console.</span> |
| <span class="sd"> max_cols : int, optional</span> |
| <span class="sd"> Maximum number of columns to display in the console.</span> |
| <span class="sd"> show_dimensions : bool, default False</span> |
| <span class="sd"> Display DataFrame dimensions (number of rows by number of columns).</span> |
| <span class="sd"> decimal : str, default '.'</span> |
| <span class="sd"> Character recognized as decimal separator, e.g. ',' in Europe.</span> |
| <span class="sd"> line_width : int, optional</span> |
| <span class="sd"> Width to wrap a line in characters.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> str (or unicode, depending on data and options)</span> |
| <span class="sd"> String representation of the dataframe.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> to_html : Convert DataFrame to HTML.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]}, columns=['col1', 'col2'])</span> |
| <span class="sd"> >>> print(df.to_string())</span> |
| <span class="sd"> col1 col2</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 1 2 5</span> |
| <span class="sd"> 2 3 6</span> |
| |
| <span class="sd"> >>> print(df.to_string(max_rows=2))</span> |
| <span class="sd"> col1 col2</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 1 2 5</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Make sure locals() call is at the top of the function so we don't capture local variables.</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">max_rows</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">max_rows</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_string</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_string</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_dict"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_dict.html#pyspark.pandas.DataFrame.to_dict">[docs]</a> <span class="k">def</span> <span class="nf">to_dict</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">orient</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"dict"</span><span class="p">,</span> <span class="n">into</span><span class="p">:</span> <span class="n">Type</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">,</span> <span class="n">Mapping</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert the DataFrame to a dictionary.</span> |
| |
| <span class="sd"> The type of the key-value pairs can be customized with the parameters</span> |
| <span class="sd"> (see below).</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas DataFrame is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}</span> |
| <span class="sd"> Determines the type of the values of the dictionary.</span> |
| |
| <span class="sd"> - 'dict' (default) : dict like {column -> {index -> value}}</span> |
| <span class="sd"> - 'list' : dict like {column -> [values]}</span> |
| <span class="sd"> - 'series' : dict like {column -> Series(values)}</span> |
| <span class="sd"> - 'split' : dict like</span> |
| <span class="sd"> {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}</span> |
| <span class="sd"> - 'records' : list like</span> |
| <span class="sd"> [{column -> value}, ... , {column -> value}]</span> |
| <span class="sd"> - 'index' : dict like {index -> {column -> value}}</span> |
| |
| <span class="sd"> Abbreviations are allowed. `s` indicates `series` and `sp`</span> |
| <span class="sd"> indicates `split`.</span> |
| |
| <span class="sd"> into : class, default dict</span> |
| <span class="sd"> The collections.abc.Mapping subclass used for all Mappings</span> |
| <span class="sd"> in the return value. Can be the actual class or an empty</span> |
| <span class="sd"> instance of the mapping type you want. If you want a</span> |
| <span class="sd"> collections.defaultdict, you must pass it initialized.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> dict, list or collections.abc.Mapping</span> |
| <span class="sd"> Return a collections.abc.Mapping object representing the DataFrame.</span> |
| <span class="sd"> The resulting transformation depends on the `orient` parameter.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'col1': [1, 2],</span> |
| <span class="sd"> ... 'col2': [0.5, 0.75]},</span> |
| <span class="sd"> ... index=['row1', 'row2'],</span> |
| <span class="sd"> ... columns=['col1', 'col2'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> col1 col2</span> |
| <span class="sd"> row1 1 0.50</span> |
| <span class="sd"> row2 2 0.75</span> |
| |
| <span class="sd"> >>> df_dict = df.to_dict()</span> |
| <span class="sd"> >>> sorted([(key, sorted(values.items())) for key, values in df_dict.items()])</span> |
| <span class="sd"> [('col1', [('row1', 1), ('row2', 2)]), ('col2', [('row1', 0.5), ('row2', 0.75)])]</span> |
| |
| <span class="sd"> You can specify the return orientation.</span> |
| |
| <span class="sd"> >>> df_dict = df.to_dict('series')</span> |
| <span class="sd"> >>> sorted(df_dict.items())</span> |
| <span class="sd"> [('col1', row1 1</span> |
| <span class="sd"> row2 2</span> |
| <span class="sd"> Name: col1, dtype: int64), ('col2', row1 0.50</span> |
| <span class="sd"> row2 0.75</span> |
| <span class="sd"> Name: col2, dtype: float64)]</span> |
| |
| <span class="sd"> >>> df_dict = df.to_dict('split')</span> |
| <span class="sd"> >>> sorted(df_dict.items()) # doctest: +ELLIPSIS</span> |
| <span class="sd"> [('columns', ['col1', 'col2']), ('data', [[1..., 0.75]]), ('index', ['row1', 'row2'])]</span> |
| |
| <span class="sd"> >>> df_dict = df.to_dict('records')</span> |
| <span class="sd"> >>> [sorted(values.items()) for values in df_dict] # doctest: +ELLIPSIS</span> |
| <span class="sd"> [[('col1', 1...), ('col2', 0.5)], [('col1', 2...), ('col2', 0.75)]]</span> |
| |
| <span class="sd"> >>> df_dict = df.to_dict('index')</span> |
| <span class="sd"> >>> sorted([(key, sorted(values.items())) for key, values in df_dict.items()])</span> |
| <span class="sd"> [('row1', [('col1', 1), ('col2', 0.5)]), ('row2', [('col1', 2), ('col2', 0.75)])]</span> |
| |
| <span class="sd"> You can also specify the mapping type.</span> |
| |
| <span class="sd"> >>> from collections import OrderedDict, defaultdict</span> |
| <span class="sd"> >>> df.to_dict(into=OrderedDict)</span> |
| <span class="sd"> OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), \</span> |
| <span class="sd">('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])</span> |
| |
| <span class="sd"> If you want a `defaultdict`, you need to initialize it:</span> |
| |
| <span class="sd"> >>> dd = defaultdict(list)</span> |
| <span class="sd"> >>> df.to_dict('records', into=dd) # doctest: +ELLIPSIS</span> |
| <span class="sd"> [defaultdict(<class 'list'>, {'col..., 'col...}), \</span> |
| <span class="sd">defaultdict(<class 'list'>, {'col..., 'col...})]</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Make sure locals() call is at the top of the function so we don't capture local variables.</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_dict</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_dict</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_latex"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_latex.html#pyspark.pandas.DataFrame.to_latex">[docs]</a> <span class="k">def</span> <span class="nf">to_latex</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">col_space</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"NaN"</span><span class="p">,</span> |
| <span class="n">formatters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span> |
| <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">str</span><span class="p">]],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]]</span> |
| <span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">float_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="nb">float</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">sparsify</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">bold_rows</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">column_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">longtable</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">escape</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">decimal</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"."</span><span class="p">,</span> |
| <span class="n">multicolumn</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">multicolumn_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">multirow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="sa">r</span><span class="sd">"""</span> |
| <span class="sd"> Render an object to a LaTeX tabular environment table.</span> |
| |
| <span class="sd"> Render an object to a tabular environment table. You can splice this into a LaTeX</span> |
| <span class="sd"> document. Requires \usepackage{booktabs}.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas object is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory. If the input</span> |
| <span class="sd"> is large, consider alternative formats.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> buf : file descriptor or None</span> |
| <span class="sd"> Buffer to write to. If None, the output is returned as a string.</span> |
| <span class="sd"> columns : list of label, optional</span> |
| <span class="sd"> The subset of columns to write. Writes all columns by default.</span> |
| <span class="sd"> col_space : int, optional</span> |
| <span class="sd"> The minimum width of each column.</span> |
| <span class="sd"> header : bool or list of str, default True</span> |
| <span class="sd"> Write out the column names. If a list of strings is given, it is assumed to be aliases</span> |
| <span class="sd"> for the column names.</span> |
| <span class="sd"> index : bool, default True</span> |
| <span class="sd"> Write row names (index).</span> |
| <span class="sd"> na_rep : str, default 'NaN'</span> |
| <span class="sd"> Missing data representation.</span> |
| <span class="sd"> formatters : list of functions or dict of {str: function}, optional</span> |
| <span class="sd"> Formatter functions to apply to columns' elements by position or name. The result of</span> |
| <span class="sd"> each function must be a unicode string. List must be of length equal to the number of</span> |
| <span class="sd"> columns.</span> |
| <span class="sd"> float_format : str, optional</span> |
| <span class="sd"> Format string for floating point numbers.</span> |
| <span class="sd"> sparsify : bool, optional</span> |
| <span class="sd"> Set to False for a DataFrame with a hierarchical index to print every multiindex key at</span> |
| <span class="sd"> each row. By default, the value will be read from the config module.</span> |
| <span class="sd"> index_names : bool, default True</span> |
| <span class="sd"> Prints the names of the indexes.</span> |
| <span class="sd"> bold_rows : bool, default False</span> |
| <span class="sd"> Make the row labels bold in the output.</span> |
| <span class="sd"> column_format : str, optional</span> |
| <span class="sd"> The columns format as specified in LaTeX table format e.g. 'rcl' for 3 columns. By</span> |
| <span class="sd"> default, 'l' will be used for all columns except columns of numbers, which default</span> |
| <span class="sd"> to 'r'.</span> |
| <span class="sd"> longtable : bool, optional</span> |
| <span class="sd"> By default, the value will be read from the pandas config module. Use a longtable</span> |
| <span class="sd"> environment instead of tabular. Requires adding a \usepackage{longtable} to your LaTeX</span> |
| <span class="sd"> preamble.</span> |
| <span class="sd"> escape : bool, optional</span> |
| <span class="sd"> By default, the value will be read from the pandas config module. When set to False,</span> |
| <span class="sd"> prevents escaping of LaTeX special characters in column names.</span> |
| <span class="sd"> encoding : str, optional</span> |
| <span class="sd"> A string representing the encoding to use in the output file, defaults to 'ascii' on</span> |
| <span class="sd"> Python 2 and 'utf-8' on Python 3.</span> |
| <span class="sd"> decimal : str, default '.'</span> |
| <span class="sd"> Character recognized as decimal separator, e.g. ',' in Europe.</span> |
| <span class="sd"> multicolumn : bool, default True</span> |
| <span class="sd"> Use multicolumn to enhance MultiIndex columns. The default will be read from the config</span> |
| <span class="sd"> module.</span> |
| <span class="sd"> multicolumn_format : str, default 'l'</span> |
| <span class="sd"> The alignment for multicolumns, similar to column_format. The default will be read from</span> |
| <span class="sd"> the config module.</span> |
| <span class="sd"> multirow : bool, default False</span> |
| <span class="sd"> Use multirow to enhance MultiIndex rows. Requires adding a \usepackage{multirow} to your</span> |
| <span class="sd"> LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained</span> |
| <span class="sd"> rows, separating groups via clines. The default will be read from the pandas config</span> |
| <span class="sd"> module.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> str or None</span> |
| <span class="sd"> If buf is None, returns the resulting LaTeX format as a string. Otherwise returns None.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.to_string : Render a DataFrame to a console-friendly</span> |
| <span class="sd"> tabular output.</span> |
| <span class="sd"> DataFrame.to_html : Render a DataFrame as an HTML table.</span> |
| |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'name': ['Raphael', 'Donatello'],</span> |
| <span class="sd"> ... 'mask': ['red', 'purple'],</span> |
| <span class="sd"> ... 'weapon': ['sai', 'bo staff']},</span> |
| <span class="sd"> ... columns=['name', 'mask', 'weapon'])</span> |
| <span class="sd"> >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> \begin{tabular}{lll}</span> |
| <span class="sd"> \toprule</span> |
| <span class="sd"> name & mask & weapon \\</span> |
| <span class="sd"> \midrule</span> |
| <span class="sd"> Raphael & red & sai \\</span> |
| <span class="sd"> Donatello & purple & bo staff \\</span> |
| <span class="sd"> \bottomrule</span> |
| <span class="sd"> \end{tabular}</span> |
| <span class="sd"> <BLANKLINE></span> |
| <span class="sd"> """</span> |
| |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_latex</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_latex</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: enable doctests once we drop Spark 2.3.x (due to type coercion logic</span> |
| <span class="c1"># when creating arrays)</span> |
| <div class="viewcode-block" id="DataFrame.transpose"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.transpose.html#pyspark.pandas.DataFrame.transpose">[docs]</a> <span class="k">def</span> <span class="nf">transpose</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Transpose index and columns.</span> |
| |
| <span class="sd"> Reflect the DataFrame over its main diagonal by writing rows as columns</span> |
| <span class="sd"> and vice-versa. The property :attr:`.T` is an accessor to the method</span> |
| <span class="sd"> :meth:`transpose`.</span> |
| |
| <span class="sd"> .. note:: This method is based on an expensive operation due to the nature</span> |
| <span class="sd"> of big data. Internally it needs to generate each row for each value, and</span> |
| <span class="sd"> then group twice - it is a huge operation. To prevent misuse, this method</span> |
| <span class="sd"> limits the input length to 'compute.max_rows' and raises a ValueError if it is exceeded.</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import option_context</span> |
| <span class="sd"> >>> with option_context('compute.max_rows', 1000): # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> ... ps.DataFrame({'a': range(1001)}).transpose()</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: Current DataFrame has more then the given limit 1000 rows.</span> |
| <span class="sd"> Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option'</span> |
| <span class="sd"> to retrieve to retrieve more than 1000 rows. Note that, before changing the</span> |
| <span class="sd"> 'compute.max_rows', this operation is considerably expensive.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> The transposed DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Transposing a DataFrame with mixed dtypes will result in a homogeneous</span> |
| <span class="sd"> DataFrame with the coerced dtype. For instance, if int and float have</span> |
| <span class="sd"> to be placed in the same column, it becomes float. If type coercion is not</span> |
| <span class="sd"> possible, it fails.</span> |
| |
| <span class="sd"> Also, note that the values in index should be unique because they become</span> |
| <span class="sd"> unique column names.</span> |
| |
| <span class="sd"> In addition, if Spark 2.3 is used, the types should always be exactly the same.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> **Square DataFrame with homogeneous dtype**</span> |
| |
| <span class="sd"> >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}</span> |
| <span class="sd"> >>> df1 = ps.DataFrame(data=d1, columns=['col1', 'col2'])</span> |
| <span class="sd"> >>> df1</span> |
| <span class="sd"> col1 col2</span> |
| <span class="sd"> 0 1 3</span> |
| <span class="sd"> 1 2 4</span> |
| |
| <span class="sd"> >>> df1_transposed = df1.T.sort_index() # doctest: +SKIP</span> |
| <span class="sd"> >>> df1_transposed # doctest: +SKIP</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> col1 1 2</span> |
| <span class="sd"> col2 3 4</span> |
| |
| <span class="sd"> When the dtype is homogeneous in the original DataFrame, we get a</span> |
| <span class="sd"> transposed DataFrame with the same dtype:</span> |
| |
| <span class="sd"> >>> df1.dtypes</span> |
| <span class="sd"> col1 int64</span> |
| <span class="sd"> col2 int64</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> >>> df1_transposed.dtypes # doctest: +SKIP</span> |
| <span class="sd"> 0 int64</span> |
| <span class="sd"> 1 int64</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> **Non-square DataFrame with mixed dtypes**</span> |
| |
| <span class="sd"> >>> d2 = {'score': [9.5, 8],</span> |
| <span class="sd"> ... 'kids': [0, 0],</span> |
| <span class="sd"> ... 'age': [12, 22]}</span> |
| <span class="sd"> >>> df2 = ps.DataFrame(data=d2, columns=['score', 'kids', 'age'])</span> |
| <span class="sd"> >>> df2</span> |
| <span class="sd"> score kids age</span> |
| <span class="sd"> 0 9.5 0 12</span> |
| <span class="sd"> 1 8.0 0 22</span> |
| |
| <span class="sd"> >>> df2_transposed = df2.T.sort_index() # doctest: +SKIP</span> |
| <span class="sd"> >>> df2_transposed # doctest: +SKIP</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> age 12.0 22.0</span> |
| <span class="sd"> kids 0.0 0.0</span> |
| <span class="sd"> score 9.5 8.0</span> |
| |
| <span class="sd"> When the DataFrame has mixed dtypes, we get a transposed DataFrame with</span> |
| <span class="sd"> the coerced dtype:</span> |
| |
| <span class="sd"> >>> df2.dtypes</span> |
| <span class="sd"> score float64</span> |
| <span class="sd"> kids int64</span> |
| <span class="sd"> age int64</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> >>> df2_transposed.dtypes # doctest: +SKIP</span> |
| <span class="sd"> 0 float64</span> |
| <span class="sd"> 1 float64</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="n">max_compute_count</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.max_rows"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">max_compute_count</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">max_compute_count</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">></span> <span class="n">max_compute_count</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Current DataFrame has more then the given limit </span><span class="si">{0}</span><span class="s2"> rows. "</span> |
| <span class="s2">"Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' "</span> |
| <span class="s2">"to retrieve to retrieve more than </span><span class="si">{0}</span><span class="s2"> rows. Note that, before changing the "</span> |
| <span class="s2">"'compute.max_rows', this operation is considerably expensive."</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">max_compute_count</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">transpose</span><span class="p">())</span> |
| |
| <span class="c1"># Explode the data to be pairs.</span> |
| <span class="c1">#</span> |
| <span class="c1"># For instance, if the current input DataFrame is as below:</span> |
| <span class="c1">#</span> |
| <span class="c1"># +------+------+------+------+------+</span> |
| <span class="c1"># |index1|index2|(a,x1)|(a,x2)|(b,x3)|</span> |
| <span class="c1"># +------+------+------+------+------+</span> |
| <span class="c1"># | y1| z1| 1| 0| 0|</span> |
| <span class="c1"># | y2| z2| 0| 50| 0|</span> |
| <span class="c1"># | y3| z3| 3| 2| 1|</span> |
| <span class="c1"># +------+------+------+------+------+</span> |
| <span class="c1">#</span> |
| <span class="c1"># Output of `exploded_df` becomes as below:</span> |
| <span class="c1">#</span> |
| <span class="c1"># +-----------------+-----------------+-----------------+-----+</span> |
| <span class="c1"># | index|__index_level_0__|__index_level_1__|value|</span> |
| <span class="c1"># +-----------------+-----------------+-----------------+-----+</span> |
| <span class="c1"># |{"a":["y1","z1"]}| a| x1| 1|</span> |
| <span class="c1"># |{"a":["y1","z1"]}| a| x2| 0|</span> |
| <span class="c1"># |{"a":["y1","z1"]}| b| x3| 0|</span> |
| <span class="c1"># |{"a":["y2","z2"]}| a| x1| 0|</span> |
| <span class="c1"># |{"a":["y2","z2"]}| a| x2| 50|</span> |
| <span class="c1"># |{"a":["y2","z2"]}| b| x3| 0|</span> |
| <span class="c1"># |{"a":["y3","z3"]}| a| x1| 3|</span> |
| <span class="c1"># |{"a":["y3","z3"]}| a| x2| 2|</span> |
| <span class="c1"># |{"a":["y3","z3"]}| b| x3| 1|</span> |
| <span class="c1"># +-----------------+-----------------+-----------------+-----+</span> |
| <span class="n">pairs</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="o">*</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"value"</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">exploded_df</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"pairs"</span><span class="p">,</span> <span class="n">pairs</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">scol</span> <span class="k">for</span> <span class="n">scol</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">])</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"a"</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"index"</span><span class="p">),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"pairs.*"</span><span class="p">),</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># After that, executes pivot with key and its index column.</span> |
| <span class="c1"># Note that index column should contain unique values since column names</span> |
| <span class="c1"># should be unique.</span> |
| <span class="n">internal_index_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">pivoted_df</span> <span class="o">=</span> <span class="n">exploded_df</span><span class="o">.</span><span class="n">groupBy</span><span class="p">(</span><span class="n">internal_index_columns</span><span class="p">)</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="s2">"index"</span><span class="p">)</span> |
| |
| <span class="n">transposed_df</span> <span class="o">=</span> <span class="n">pivoted_df</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">first</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"value"</span><span class="p">)))</span> |
| |
| <span class="n">new_data_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span> |
| <span class="nb">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">internal_index_columns</span><span class="p">,</span> <span class="n">transposed_df</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="kc">None</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">col</span><span class="p">)[</span><span class="s2">"a"</span><span class="p">])</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">new_data_columns</span><span class="p">)</span> |
| <span class="p">]</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">transposed_df</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">transposed_df</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal_index_columns</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">transposed_df</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">new_data_columns</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <span class="n">T</span> <span class="o">=</span> <span class="nb">property</span><span class="p">(</span><span class="n">transpose</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.apply"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.apply.html#pyspark.pandas.DataFrame.apply">[docs]</a> <span class="k">def</span> <span class="nf">apply</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span> <span class="o">**</span><span class="n">kwds</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="s2">"DataFrame"</span><span class="p">,</span> <span class="s2">"Index"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Apply a function along an axis of the DataFrame.</span> |
| |
| <span class="sd"> Objects passed to the function are Series objects whose index is</span> |
| <span class="sd"> either the DataFrame's index (``axis=0``) or the DataFrame's columns</span> |
| <span class="sd"> (``axis=1``).</span> |
| |
| <span class="sd"> See also `Transform and apply a function</span> |
| <span class="sd"> &lt;https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html&gt;`_.</span> |
| |
| <span class="sd"> .. note:: when `axis` is 0 or 'index', the `func` is unable to access</span> |
| <span class="sd"> the whole input series. pandas-on-Spark internally splits the input series into</span> |
| <span class="sd"> multiple batches and calls `func` with each batch multiple times. Therefore, operations</span> |
| <span class="sd"> such as global aggregations are impossible. See the example below.</span> |
| |
| <span class="sd"> >>> # This case does not return the length of whole series but of the batch internally</span> |
| <span class="sd"> ... # used.</span> |
| <span class="sd"> ... def length(s) -> int:</span> |
| <span class="sd"> ... return len(s)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': range(1000)})</span> |
| <span class="sd"> >>> df.apply(length, axis=0) # doctest: +SKIP</span> |
| <span class="sd"> 0 83</span> |
| <span class="sd"> 1 83</span> |
| <span class="sd"> 2 83</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> 10 83</span> |
| <span class="sd"> 11 83</span> |
| <span class="sd"> dtype: int32</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify the return type as `Series` or scalar value in ``func``,</span> |
| <span class="sd"> for instance, as below:</span> |
| |
| <span class="sd"> >>> def square(s) -> ps.Series[np.int32]:</span> |
| <span class="sd"> ... return s ** 2</span> |
| |
| <span class="sd"> pandas-on-Spark uses return type hint and does not try to infer the type.</span> |
| |
| <span class="sd"> When axis is 1, the return type must be specified as `DataFrame` or a scalar value</span> |
| <span class="sd"> with type hints as below:</span> |
| |
| <span class="sd"> >>> def plus_one(x) -> ps.DataFrame[float, float]:</span> |
| <span class="sd"> ... return x + 1</span> |
| |
| <span class="sd"> If the return type is specified as `DataFrame`, the output column names become</span> |
| <span class="sd"> `c0, c1, c2 ... cn`. These names are positionally mapped to the returned</span> |
| <span class="sd"> DataFrame in ``func``.</span> |
| |
| <span class="sd"> To specify the column names, you can assign them in a pandas friendly style as below:</span> |
| |
| <span class="sd"> >>> def plus_one(x) -> ps.DataFrame["a": float, "b": float]:</span> |
| <span class="sd"> ... return x + 1</span> |
| |
| <span class="sd"> >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})</span> |
| <span class="sd"> >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]:</span> |
| <span class="sd"> ... return x + 1</span> |
| |
| <span class="sd"> However, this way switches the index type to default index type in the output</span> |
| <span class="sd"> because the type hint cannot express the index type at this moment. Use</span> |
| <span class="sd"> `reset_index()` to keep index as a workaround.</span> |
| |
| <span class="sd"> When the given function has the return type annotated, the original index of the</span> |
| <span class="sd"> DataFrame will be lost and then a default index will be attached to the result.</span> |
| <span class="sd"> Please be careful about configuring the default index. See also `Default Index Type</span> |
| <span class="sd"> &lt;https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type&gt;`_.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> Function to apply to each column or row.</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, default 0</span> |
| <span class="sd"> Axis along which the function is applied:</span> |
| |
| <span class="sd"> * 0 or 'index': apply function to each column.</span> |
| <span class="sd"> * 1 or 'columns': apply function to each row.</span> |
| <span class="sd"> args : tuple</span> |
| <span class="sd"> Positional arguments to pass to `func` in addition to the</span> |
| <span class="sd"> array/series.</span> |
| <span class="sd"> **kwds</span> |
| <span class="sd"> Additional keyword arguments to pass as keyword arguments to</span> |
| <span class="sd"> `func`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> Result of applying ``func`` along the given axis of the</span> |
| <span class="sd"> DataFrame.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.applymap : For elementwise operations.</span> |
| <span class="sd"> DataFrame.aggregate : Only perform aggregating type operations.</span> |
| <span class="sd"> DataFrame.transform : Only perform transforming type operations.</span> |
| <span class="sd"> Series.apply : The equivalent function for Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[4, 9]] * 3, columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 4 9</span> |
| <span class="sd"> 1 4 9</span> |
| <span class="sd"> 2 4 9</span> |
| |
| <span class="sd"> Using a numpy universal function (in this case the same as</span> |
| <span class="sd"> ``np.sqrt(df)``):</span> |
| |
| <span class="sd"> >>> def sqrt(x) -> ps.Series[float]:</span> |
| <span class="sd"> ... return np.sqrt(x)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.apply(sqrt, axis=0)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 3.0</span> |
| <span class="sd"> 1 2.0 3.0</span> |
| <span class="sd"> 2 2.0 3.0</span> |
| |
| <span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span> |
| |
| <span class="sd"> >>> df.apply(np.sqrt, axis=0)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 2.0 3.0</span> |
| <span class="sd"> 1 2.0 3.0</span> |
| <span class="sd"> 2 2.0 3.0</span> |
| |
| <span class="sd"> When `axis` is 1 or 'columns', it applies the function for each row.</span> |
| |
| <span class="sd"> >>> def summation(x) -> np.int64:</span> |
| <span class="sd"> ... return np.sum(x)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.apply(summation, axis=1)</span> |
| <span class="sd"> 0 13</span> |
| <span class="sd"> 1 13</span> |
| <span class="sd"> 2 13</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Likewise, you can omit the type hint and let pandas-on-Spark infer its type.</span> |
| |
| <span class="sd"> >>> df.apply(np.sum, axis=1)</span> |
| <span class="sd"> 0 13</span> |
| <span class="sd"> 1 13</span> |
| <span class="sd"> 2 13</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> df.apply(max, axis=1)</span> |
| <span class="sd"> 0 9</span> |
| <span class="sd"> 1 9</span> |
| <span class="sd"> 2 9</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Returning a list-like will result in a Series</span> |
| |
| <span class="sd"> >>> df.apply(lambda x: [1, 2], axis=1)</span> |
| <span class="sd"> 0 [1, 2]</span> |
| <span class="sd"> 1 [1, 2]</span> |
| <span class="sd"> 2 [1, 2]</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> To specify the types when `axis` is 1, annotate the function with DataFrame[...].</span> |
| <span class="sd"> Column names given in the annotation are kept; otherwise they are generated automatically.</span> |
| |
| <span class="sd"> >>> def identify(x) -> ps.DataFrame['A': np.int64, 'B': np.int64]:</span> |
| <span class="sd"> ... return x</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.apply(identify, axis=1)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 4 9</span> |
| <span class="sd"> 1 4 9</span> |
| <span class="sd"> 2 4 9</span> |
| |
| <span class="sd"> You can also specify extra arguments.</span> |
| |
| <span class="sd"> >>> def plus_two(a, b, c) -> ps.DataFrame[np.int64, np.int64]:</span> |
| <span class="sd"> ... return a + b + c</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.apply(plus_two, axis=1, args=(1,), c=3)</span> |
| <span class="sd"> c0 c1</span> |
| <span class="sd"> 0 8 13</span> |
| <span class="sd"> 1 8 13</span> |
| <span class="sd"> 2 8 13</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">GroupBy</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">types</span><span class="o">.</span><span class="n">FunctionType</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">),</span> <span class="s2">"the first argument should be a callable function."</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="n">func</span> |
| <span class="n">func</span> <span class="o">=</span> <span class="k">lambda</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">f</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"return"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span> |
| |
| <span class="k">def</span> <span class="nf">apply_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="n">pdf_or_pser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pdf_or_pser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pdf_or_pser</span> |
| |
| <span class="n">self_applied</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="p">)</span> <span class="c1"># type: "DataFrame"</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="kc">None</span> <span class="c1"># type: Optional[List[Label]]</span> |
| <span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span> |
| <span class="c1"># Here we execute with the first `compute.shortcut_limit` rows to get the return type.</span> |
| <span class="c1"># If there are no more records than that limit, it uses pandas API directly for a shortcut.</span> |
| <span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.shortcut_limit"</span><span class="p">)</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">self_applied</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwds</span><span class="p">)</span> |
| <span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">applied</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o"><=</span> <span class="n">limit</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser_or_psdf</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psser_or_psdf</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">_psdf</span> |
| |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">]</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">]</span> |
| |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">index_fields</span> <span class="o">+</span> <span class="n">data_fields</span><span class="p">])</span> |
| |
| <span class="n">output_func</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_make_pandas_df_builder_func</span><span class="p">(</span> |
| <span class="n">self_applied</span><span class="p">,</span> <span class="n">apply_func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">,</span> <span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">self_applied</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">to_internal_spark_frame</span><span class="o">.</span><span class="n">mapInPandas</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">iterator</span><span class="p">:</span> <span class="nb">map</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">iterator</span><span class="p">),</span> <span class="n">schema</span><span class="o">=</span><span class="n">return_schema</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># If schema is inferred, we can restore indexes too.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="n">require_index_axis</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">)</span> |
| <span class="n">require_column_axis</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">DataFrameType</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">require_index_axis</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"The given function should specify a scalar or a series as its type "</span> |
| <span class="s2">"hints when axis is 0 or 'index'; however, the return type "</span> |
| <span class="s2">"was </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">return_sig</span> |
| <span class="p">)</span> |
| <span class="n">dtype</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">spark_type</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">self_applied</span><span class="o">.</span><span class="n">columns</span> |
| <span class="p">]</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span> |
| <span class="k">elif</span> <span class="n">require_column_axis</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"The given function should specify a scalar or a frame as its type "</span> |
| <span class="s2">"hints when axis is 1 or 'column'; however, the return type "</span> |
| <span class="s2">"was </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">return_sig</span> |
| <span class="p">)</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrameType</span><span class="p">,</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">fields</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrameType</span><span class="p">,</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">spark_type</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># any axis is fine.</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">ScalarType</span><span class="p">,</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">spark_type</span> |
| <span class="n">dtype</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">ScalarType</span><span class="p">,</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> |
| <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span> |
| <span class="n">name</span><span class="o">=</span><span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span><span class="p">]</span> |
| |
| <span class="n">output_func</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_make_pandas_df_builder_func</span><span class="p">(</span> |
| <span class="n">self_applied</span><span class="p">,</span> <span class="n">apply_func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">,</span> <span class="n">retain_index</span><span class="o">=</span><span class="kc">False</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">self_applied</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">to_internal_spark_frame</span><span class="o">.</span><span class="n">mapInPandas</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">iterator</span><span class="p">:</span> <span class="nb">map</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">iterator</span><span class="p">),</span> <span class="n">schema</span><span class="o">=</span><span class="n">return_schema</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Otherwise, it loses index.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">result</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> <span class="c1"># type: "DataFrame"</span> |
| <span class="k">if</span> <span class="n">should_return_series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">result</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.transform"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.transform.html#pyspark.pandas.DataFrame.transform">[docs]</a> <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">],</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Call ``func`` on self producing a DataFrame with transformed values</span> |
| <span class="sd"> that has the same length as its input.</span> |
| |
| <span class="sd"> See also `Transform and apply a function</span> |
| <span class="sd"> <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span> |
| |
| <span class="sd"> >>> def square(x) -> ps.Series[np.int32]:</span> |
| <span class="sd"> ... return x ** 2</span> |
| |
| <span class="sd"> pandas-on-Spark uses return type hint and does not try to infer the type.</span> |
| |
| <span class="sd"> .. note:: the series within ``func`` is actually multiple pandas series as the</span> |
| <span class="sd"> segments of the whole pandas-on-Spark series; therefore, the length of each series</span> |
| <span class="sd"> is not guaranteed. As an example, an aggregation against each series</span> |
| <span class="sd"> does not work as a global aggregation but as an aggregation of each segment. See</span> |
| <span class="sd"> below:</span> |
| |
| <span class="sd"> >>> def func(x) -> ps.Series[np.int32]:</span> |
| <span class="sd"> ... return x + sum(x)</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> Function to use for transforming the data. It must work when pandas Series</span> |
| <span class="sd"> is passed.</span> |
| <span class="sd"> axis : int, default 0 or 'index'</span> |
| <span class="sd"> Can only be set to 0 at the moment.</span> |
| <span class="sd"> *args</span> |
| <span class="sd"> Positional arguments to pass to func.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> Keyword arguments to pass to func.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> A DataFrame that must have the same length as self.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> Exception : If the returned DataFrame has a different length than self.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.aggregate : Only perform aggregating type operations.</span> |
| <span class="sd"> DataFrame.apply : Invoke function on DataFrame.</span> |
| <span class="sd"> Series.transform : The equivalent function for Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': range(3), 'B': range(1, 4)}, columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 1</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 2 3</span> |
| |
| <span class="sd"> >>> def square(x) -> ps.Series[np.int32]:</span> |
| <span class="sd"> ... return x ** 2</span> |
| <span class="sd"> >>> df.transform(square)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 1</span> |
| <span class="sd"> 1 1 4</span> |
| <span class="sd"> 2 4 9</span> |
| |
| <span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span> |
| |
| <span class="sd"> >>> df.transform(lambda x: x ** 2)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 1</span> |
| <span class="sd"> 1 1 4</span> |
| <span class="sd"> 2 4 9</span> |
| |
| <span class="sd"> For multi-index columns:</span> |
| |
| <span class="sd"> >>> df.columns = [('X', 'A'), ('X', 'B')]</span> |
| <span class="sd"> >>> df.transform(square) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> X</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 1</span> |
| <span class="sd"> 1 1 4</span> |
| <span class="sd"> 2 4 9</span> |
| |
| <span class="sd"> >>> (df * -1).transform(abs) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> X</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 1</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 2 3</span> |
| |
| <span class="sd"> You can also specify extra arguments.</span> |
| |
| <span class="sd"> >>> def calculation(x, y, z) -> ps.Series[int]:</span> |
| <span class="sd"> ... return x ** y + z</span> |
| <span class="sd"> >>> df.transform(calculation, y=10, z=20) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> X</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 20 21</span> |
| <span class="sd"> 1 21 1044</span> |
| <span class="sd"> 2 1044 59069</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">types</span><span class="o">.</span><span class="n">FunctionType</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">),</span> <span class="s2">"the first argument should be a callable function."</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="n">func</span> |
| <span class="n">func</span> <span class="o">=</span> <span class="k">lambda</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">f</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"return"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span> |
| |
| <span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span> |
| <span class="c1"># Here we execute with the first `compute.shortcut_limit` + 1 records to get the return type.</span> |
| <span class="c1"># If there are no more than `compute.shortcut_limit` records, it uses pandas API directly for a shortcut.</span> |
| <span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.shortcut_limit"</span><span class="p">)</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="n">transformed</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">axis</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">transformed</span><span class="p">)</span> <span class="c1"># type: "DataFrame"</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o"><=</span> <span class="n">limit</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span> |
| |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">input_label</span><span class="p">,</span> <span class="n">output_label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">):</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">input_label</span><span class="p">)</span> |
| |
| <span class="n">field</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">field_for</span><span class="p">(</span><span class="n">output_label</span><span class="p">)</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> |
| <span class="n">data_fields</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">field</span><span class="p">)</span> |
| |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">spark_type</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">_transform_batch</span><span class="p">(</span> |
| <span class="n">func</span><span class="o">=</span><span class="k">lambda</span> <span class="n">c</span><span class="p">:</span> <span class="n">func</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">),</span> |
| <span class="n">return_type</span><span class="o">=</span><span class="n">SeriesType</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">transform_batch</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.pop"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.pop.html#pyspark.pandas.DataFrame.pop">[docs]</a> <span class="k">def</span> <span class="nf">pop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Name</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return item and drop from frame. Raise KeyError if not found.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> item : str</span> |
| <span class="sd"> Label of column to be popped.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([('falcon', 'bird', 389.0),</span> |
| <span class="sd"> ... ('parrot', 'bird', 24.0),</span> |
| <span class="sd"> ... ('lion', 'mammal', 80.5),</span> |
| <span class="sd"> ... ('monkey','mammal', np.nan)],</span> |
| <span class="sd"> ... columns=('name', 'class', 'max_speed'))</span> |
| |
| <span class="sd"> >>> df</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 1 parrot bird 24.0</span> |
| <span class="sd"> 2 lion mammal 80.5</span> |
| <span class="sd"> 3 monkey mammal NaN</span> |
| |
| <span class="sd"> >>> df.pop('class')</span> |
| <span class="sd"> 0 bird</span> |
| <span class="sd"> 1 bird</span> |
| <span class="sd"> 2 mammal</span> |
| <span class="sd"> 3 mammal</span> |
| <span class="sd"> Name: class, dtype: object</span> |
| |
| <span class="sd"> >>> df</span> |
| <span class="sd"> name max_speed</span> |
| <span class="sd"> 0 falcon 389.0</span> |
| <span class="sd"> 1 parrot 24.0</span> |
| <span class="sd"> 2 lion 80.5</span> |
| <span class="sd"> 3 monkey NaN</span> |
| |
| <span class="sd"> Also supports MultiIndex columns</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([('falcon', 'bird', 389.0),</span> |
| <span class="sd"> ... ('parrot', 'bird', 24.0),</span> |
| <span class="sd"> ... ('lion', 'mammal', 80.5),</span> |
| <span class="sd"> ... ('monkey','mammal', np.nan)],</span> |
| <span class="sd"> ... columns=('name', 'class', 'max_speed'))</span> |
| <span class="sd"> >>> columns = [('a', 'name'), ('a', 'class'), ('b', 'max_speed')]</span> |
| <span class="sd"> >>> df.columns = pd.MultiIndex.from_tuples(columns)</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 1 parrot bird 24.0</span> |
| <span class="sd"> 2 lion mammal 80.5</span> |
| <span class="sd"> 3 monkey mammal NaN</span> |
| |
| <span class="sd"> >>> df.pop('a')</span> |
| <span class="sd"> name class</span> |
| <span class="sd"> 0 falcon bird</span> |
| <span class="sd"> 1 parrot bird</span> |
| <span class="sd"> 2 lion mammal</span> |
| <span class="sd"> 3 monkey mammal</span> |
| |
| <span class="sd"> >>> df</span> |
| <span class="sd"> b</span> |
| <span class="sd"> max_speed</span> |
| <span class="sd"> 0 389.0</span> |
| <span class="sd"> 1 24.0</span> |
| <span class="sd"> 2 80.5</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> """</span> |
| <span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="p">[</span><span class="n">item</span><span class="p">]</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">item</span><span class="p">)</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">result</span></div> |
| |
| <span class="c1"># TODO: support the axis parameter when it is 1 or 'columns'</span> |
| <div class="viewcode-block" id="DataFrame.xs"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.xs.html#pyspark.pandas.DataFrame.xs">[docs]</a> <span class="k">def</span> <span class="nf">xs</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Name</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrameOrSeries</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return cross-section from the DataFrame.</span> |
| |
| <span class="sd"> This method takes a `key` argument to select data at a particular</span> |
| <span class="sd"> level of a MultiIndex.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> key : label or tuple of label</span> |
| <span class="sd"> Label contained in the index, or partially in a MultiIndex.</span> |
| <span class="sd"> axis : 0 or 'index', default 0</span> |
| <span class="sd"> Axis to retrieve cross-section on.</span> |
| <span class="sd"> Currently only 0 or 'index' is supported.</span> |
| <span class="sd"> level : object, defaults to first n levels (n=1 or len(key))</span> |
| <span class="sd"> In case of a key partially contained in a MultiIndex, indicate</span> |
| <span class="sd"> which levels are used. Levels can be referred by label or position.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| <span class="sd"> Cross-section from the original DataFrame</span> |
| <span class="sd"> corresponding to the selected index levels.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.loc : Access a group of rows and columns</span> |
| <span class="sd"> by label(s) or a boolean array.</span> |
| <span class="sd"> DataFrame.iloc : Purely integer-location based indexing</span> |
| <span class="sd"> for selection by position.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> d = {'num_legs': [4, 4, 2, 2],</span> |
| <span class="sd"> ... 'num_wings': [0, 0, 2, 2],</span> |
| <span class="sd"> ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],</span> |
| <span class="sd"> ... 'animal': ['cat', 'dog', 'bat', 'penguin'],</span> |
| <span class="sd"> ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}</span> |
| <span class="sd"> >>> df = ps.DataFrame(data=d)</span> |
| <span class="sd"> >>> df = df.set_index(['class', 'animal', 'locomotion'])</span> |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs num_wings</span> |
| <span class="sd"> class animal locomotion</span> |
| <span class="sd"> mammal cat walks 4 0</span> |
| <span class="sd"> dog walks 4 0</span> |
| <span class="sd"> bat flies 2 2</span> |
| <span class="sd"> bird penguin walks 2 2</span> |
| |
| <span class="sd"> Get values at specified index</span> |
| |
| <span class="sd"> >>> df.xs('mammal') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs num_wings</span> |
| <span class="sd"> animal locomotion</span> |
| <span class="sd"> cat walks 4 0</span> |
| <span class="sd"> dog walks 4 0</span> |
| <span class="sd"> bat flies 2 2</span> |
| |
| <span class="sd"> Get values at several indexes</span> |
| |
| <span class="sd"> >>> df.xs(('mammal', 'dog')) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs num_wings</span> |
| <span class="sd"> locomotion</span> |
| <span class="sd"> walks 4 0</span> |
| |
| <span class="sd"> >>> df.xs(('mammal', 'dog', 'walks')) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs 4</span> |
| <span class="sd"> num_wings 0</span> |
| <span class="sd"> Name: (mammal, dog, walks), dtype: int64</span> |
| |
| <span class="sd"> Get values at specified index and level</span> |
| |
| <span class="sd"> >>> df.xs('cat', level=1) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs num_wings</span> |
| <span class="sd"> class locomotion</span> |
| <span class="sd"> mammal walks 4 0</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"'key' should be a scalar value or tuple that contains scalar values"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">level</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">):</span> |
| <span class="n">key</span> <span class="o">=</span> <span class="p">(</span><span class="n">key</span><span class="p">,)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="o">></span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"Key length (</span><span class="si">{}</span><span class="s2">) exceeds index depth (</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">level</span> <span class="o">=</span> <span class="mi">0</span> |
| |
| <span class="n">rows</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">lvl</span><span class="p">]</span> <span class="o">==</span> <span class="n">index</span> <span class="k">for</span> <span class="n">lvl</span><span class="p">,</span> <span class="n">index</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">level</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&</span> <span class="n">y</span><span class="p">,</span> <span class="n">rows</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> <span class="c1"># type: DataFrame</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">transpose</span><span class="p">()))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[:</span><span class="n">level</span><span class="p">]</span> |
| <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">level</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="p">)</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">[:</span><span class="n">level</span><span class="p">]</span> <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">[</span><span class="n">level</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">[:</span><span class="n">level</span><span class="p">]</span> <span class="o">+</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">[</span><span class="n">level</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="p">:]</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <!-- Highlighted source listing for DataFrame.between_time; the [docs] link points back to its API reference page. --><div class="viewcode-block" id="DataFrame.between_time"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.between_time.html#pyspark.pandas.DataFrame.between_time">[docs]</a> <span class="k">def</span> <span class="nf">between_time</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">start_time</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">datetime</span><span class="o">.</span><span class="n">time</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">end_time</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">datetime</span><span class="o">.</span><span class="n">time</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">include_start</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">include_end</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Select values between particular times of the day (example: 9:00-9:30 AM).</span> |
| |
| <span class="sd"> By setting ``start_time`` to be later than ``end_time``,</span> |
| <span class="sd"> you can get the times that are *not* between the two times.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start_time : datetime.time or str</span> |
| <span class="sd"> Initial time as a time filter limit.</span> |
| <span class="sd"> end_time : datetime.time or str</span> |
| <span class="sd"> End time as a time filter limit.</span> |
| <span class="sd"> include_start : bool, default True</span> |
| <span class="sd"> Whether the start time needs to be included in the result.</span> |
| <span class="sd"> include_end : bool, default True</span> |
| <span class="sd"> Whether the end time needs to be included in the result.</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, default 0</span> |
| <span class="sd"> Determine range time on index or columns value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Data from the original object filtered to the specified dates range.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If the index is not a :class:`DatetimeIndex`</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> at_time : Select values at a particular time of the day.</span> |
| <span class="sd"> first : Select initial periods of time series based on a date offset.</span> |
| <span class="sd"> last : Select final periods of time series based on a date offset.</span> |
| <span class="sd"> DatetimeIndex.indexer_between_time : Get just the index locations for</span> |
| <span class="sd"> values between particular times of the day.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> idx = pd.date_range('2018-04-09', periods=4, freq='1D20min')</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=idx)</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-09 00:00:00 1</span> |
| <span class="sd"> 2018-04-10 00:20:00 2</span> |
| <span class="sd"> 2018-04-11 00:40:00 3</span> |
| <span class="sd"> 2018-04-12 01:00:00 4</span> |
| |
| <span class="sd"> >>> psdf.between_time('0:15', '0:45')</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-10 00:20:00 2</span> |
| <span class="sd"> 2018-04-11 00:40:00 3</span> |
| |
| <span class="sd"> You get the times that are *not* between two times by setting</span> |
| <span class="sd"> ``start_time`` later than ``end_time``:</span> |
| |
| <span class="sd"> >>> psdf.between_time('0:45', '0:15')</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-09 00:00:00 1</span> |
| <span class="sd"> 2018-04-12 01:00:00 4</span> |
| <span class="sd"> """</span> |
| <!-- Guard clauses in the listed code: any axis other than 0 raises NotImplementedError, and a non-DatetimeIndex raises TypeError. --><span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"between_time currently only works for axis=0"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DatetimeIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Index must be DatetimeIndex"</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__index_name__"</span><span class="p">)</span> |
| <span class="n">return_types</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">dtype</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">dtypes</span><span class="p">)</span> |
| |
| <!-- Nested helper that is later passed to apply_batch: it calls pandas between_time and then reset_index on the result. --><span class="k">def</span> <span class="nf">pandas_between_time</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">-></span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">[</span><span class="n">return_types</span><span class="p">]:</span> <span class="c1"># type: ignore</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">between_time</span><span class="p">(</span><span class="n">start_time</span><span class="p">,</span> <span class="n">end_time</span><span class="p">,</span> <span class="n">include_start</span><span class="p">,</span> <span class="n">include_end</span><span class="p">)</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| |
| <span class="c1"># apply_batch will remove the index of the pandas-on-Spark DataFrame and attach a</span> |
| <span class="c1"># default index, which will never be used. So use "distributed" index as a dummy to</span> |
| <span class="c1"># avoid overhead.</span> |
| <span class="k">with</span> <span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.default_index_type"</span><span class="p">,</span> <span class="s2">"distributed"</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">apply_batch</span><span class="p">(</span><span class="n">pandas_between_time</span><span class="p">)</span> |
| |
| <!-- Return value: a copy of self's internal frame that takes the first data column of the apply_batch result as the index and the remaining columns as data. --><span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[:</span><span class="mi">1</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[:</span><span class="mi">1</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[</span><span class="mi">1</span><span class="p">:],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">1</span><span class="p">:],</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: implement axis=1</span> |
| <!-- Highlighted source listing for DataFrame.at_time; the [docs] link points back to its API reference page. --><div class="viewcode-block" id="DataFrame.at_time"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.at_time.html#pyspark.pandas.DataFrame.at_time">[docs]</a> <span class="k">def</span> <span class="nf">at_time</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">time</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">datetime</span><span class="o">.</span><span class="n">time</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> <span class="n">asof</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Select values at particular time of day (example: 9:30AM).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> time : datetime.time or str</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, default 0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If the index is not a :class:`DatetimeIndex`</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> between_time : Select values between particular times of the day.</span> |
| <span class="sd"> DatetimeIndex.indexer_at_time : Get just the index locations for</span> |
| <span class="sd"> values at particular time of the day.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> idx = pd.date_range('2018-04-09', periods=4, freq='12H')</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=idx)</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-09 00:00:00 1</span> |
| <span class="sd"> 2018-04-09 12:00:00 2</span> |
| <span class="sd"> 2018-04-10 00:00:00 3</span> |
| <span class="sd"> 2018-04-10 12:00:00 4</span> |
| |
| <span class="sd"> >>> psdf.at_time('12:00')</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-09 12:00:00 2</span> |
| <span class="sd"> 2018-04-10 12:00:00 4</span> |
| <span class="sd"> """</span> |
| <!-- Guard clauses in the listed code: a truthy asof and any axis other than 0 raise NotImplementedError; a non-DatetimeIndex raises TypeError. --><span class="k">if</span> <span class="n">asof</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"'asof' argument is not supported"</span><span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"at_time currently only works for axis=0"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DatetimeIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Index must be DatetimeIndex"</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__index_name__"</span><span class="p">)</span> |
| <span class="n">return_types</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">dtype</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">dtypes</span><span class="p">)</span> |
| |
| <!-- The pandas version check picks between two helper definitions: below 0.24 the helper calls at_time without the axis argument, otherwise axis is passed through. --><span class="k">if</span> <span class="n">LooseVersion</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">__version__</span><span class="p">)</span> <span class="o"><</span> <span class="n">LooseVersion</span><span class="p">(</span><span class="s2">"0.24"</span><span class="p">):</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_at_time</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">-></span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">[</span><span class="n">return_types</span><span class="p">]:</span> <span class="c1"># type: ignore</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">at_time</span><span class="p">(</span><span class="n">time</span><span class="p">,</span> <span class="n">asof</span><span class="p">)</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_at_time</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">-></span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">[</span><span class="n">return_types</span><span class="p">]:</span> <span class="c1"># type: ignore</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">at_time</span><span class="p">(</span><span class="n">time</span><span class="p">,</span> <span class="n">asof</span><span class="p">,</span> <span class="n">axis</span><span class="p">)</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| |
| <span class="c1"># apply_batch will remove the index of the pandas-on-Spark DataFrame and attach</span> |
| <span class="c1"># a default index, which will never be used. So use "distributed" index as a dummy</span> |
| <span class="c1"># to avoid overhead.</span> |
| <span class="k">with</span> <span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.default_index_type"</span><span class="p">,</span> <span class="s2">"distributed"</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">apply_batch</span><span class="p">(</span><span class="n">pandas_at_time</span><span class="p">)</span> |
| |
| <!-- Return value: a copy of self's internal frame that takes the first data column of the apply_batch result as the index and the remaining columns as data. --><span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[:</span><span class="mi">1</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[:</span><span class="mi">1</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[</span><span class="mi">1</span><span class="p">:],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">1</span><span class="p">:],</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.where"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.where.html#pyspark.pandas.DataFrame.where">[docs]</a> <span class="k">def</span> <span class="nf">where</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">cond</span><span class="p">:</span> <span class="n">DataFrameOrSeries</span><span class="p">,</span> |
| <span class="n">other</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrameOrSeries</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Replace values where the condition is False.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cond : boolean DataFrame</span> |
| <span class="sd"> Where cond is True, keep the original value. Where False,</span> |
| <span class="sd"> replace with corresponding value from other.</span> |
| <span class="sd"> other : scalar, DataFrame</span> |
| <span class="sd"> Entries where cond is False are replaced with corresponding value from other.</span> |
| <span class="sd"> axis : int, default None</span> |
| <span class="sd"> Can only be set to 0 at the moment for compatibility with pandas.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> df1 = ps.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]})</span> |
| <span class="sd"> >>> df2 = ps.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]})</span> |
| <span class="sd"> >>> df1</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 100</span> |
| <span class="sd"> 1 1 200</span> |
| <span class="sd"> 2 2 300</span> |
| <span class="sd"> 3 3 400</span> |
| <span class="sd"> 4 4 500</span> |
| <span class="sd"> >>> df2</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 -100</span> |
| <span class="sd"> 1 -1 -200</span> |
| <span class="sd"> 2 -2 -300</span> |
| <span class="sd"> 3 -3 -400</span> |
| <span class="sd"> 4 -4 -500</span> |
| |
| <span class="sd"> >>> df1.where(df1 > 0).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 NaN 100.0</span> |
| <span class="sd"> 1 1.0 200.0</span> |
| <span class="sd"> 2 2.0 300.0</span> |
| <span class="sd"> 3 3.0 400.0</span> |
| <span class="sd"> 4 4.0 500.0</span> |
| |
| <span class="sd"> >>> df1.where(df1 > 1, 10).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 10 100</span> |
| <span class="sd"> 1 10 200</span> |
| <span class="sd"> 2 2 300</span> |
| <span class="sd"> 3 3 400</span> |
| <span class="sd"> 4 4 500</span> |
| |
| <span class="sd"> >>> df1.where(df1 > 1, df1 + 100).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 100 100</span> |
| <span class="sd"> 1 101 200</span> |
| <span class="sd"> 2 2 300</span> |
| <span class="sd"> 3 3 400</span> |
| <span class="sd"> 4 4 500</span> |
| |
| <span class="sd"> >>> df1.where(df1 > 1, df2).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 100</span> |
| <span class="sd"> 1 -1 200</span> |
| <span class="sd"> 2 2 300</span> |
| <span class="sd"> 3 3 400</span> |
| <span class="sd"> 4 4 500</span> |
| |
| <span class="sd"> When the column names of cond differ from those of self, all values are treated as False</span> |
| |
| <span class="sd"> >>> cond = ps.DataFrame({'C': [0, -1, -2, -3, -4], 'D':[4, 3, 2, 1, 0]}) % 3 == 0</span> |
| <span class="sd"> >>> cond</span> |
| <span class="sd"> C D</span> |
| <span class="sd"> 0 True False</span> |
| <span class="sd"> 1 False True</span> |
| <span class="sd"> 2 False False</span> |
| <span class="sd"> 3 True False</span> |
| <span class="sd"> 4 False True</span> |
| |
| <span class="sd"> >>> df1.where(cond).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 NaN NaN</span> |
| <span class="sd"> 1 NaN NaN</span> |
| <span class="sd"> 2 NaN NaN</span> |
| <span class="sd"> 3 NaN NaN</span> |
| <span class="sd"> 4 NaN NaN</span> |
| |
| <span class="sd"> When the type of cond is Series, it just checks the boolean values regardless of column name</span> |
| |
| <span class="sd"> >>> cond = ps.Series([1, 2]) > 1</span> |
| <span class="sd"> >>> cond</span> |
| <span class="sd"> 0 False</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> dtype: bool</span> |
| |
| <span class="sd"> >>> df1.where(cond).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 NaN NaN</span> |
| <span class="sd"> 1 1.0 200.0</span> |
| <span class="sd"> 2 NaN NaN</span> |
| <span class="sd"> 3 NaN NaN</span> |
| <span class="sd"> 4 NaN NaN</span> |
| |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="n">tmp_cond_col_name</span> <span class="o">=</span> <span class="s2">"__tmp_cond_col_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span> |
| <span class="n">tmp_other_col_name</span> <span class="o">=</span> <span class="s2">"__tmp_other_col_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="n">tmp_cond_col_names</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">tmp_cond_col_name</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span><span class="p">[</span> |
| <span class="p">[</span> |
| <span class="p">(</span> |
| <span class="n">cond</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">cond</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">else</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">tmp_cond_col_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">]</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_cond_col_names</span><span class="p">]</span> <span class="o">=</span> <span class="n">cond</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span><span class="p">[</span> |
| <span class="p">[</span><span class="n">cond</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">tmp_cond_col_names</span><span class="p">]</span> |
| <span class="p">]</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_cond_col_names</span><span class="p">]</span> <span class="o">=</span> <span class="n">cond</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"type of cond must be a DataFrame or Series"</span><span class="p">)</span> |
| |
| <span class="n">tmp_other_col_names</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">tmp_other_col_name</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">other</span> <span class="o">=</span> <span class="n">other</span><span class="p">[</span> |
| <span class="p">[</span> |
| <span class="p">(</span> |
| <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">else</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">tmp_other_col_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">]</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_other_col_names</span><span class="p">]</span> <span class="o">=</span> <span class="n">other</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">other</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="n">other</span> <span class="o">=</span> <span class="n">other</span><span class="p">[</span> |
| <span class="p">[</span><span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">tmp_other_col_names</span><span class="p">]</span> |
| <span class="p">]</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_other_col_names</span><span class="p">]</span> <span class="o">=</span> <span class="n">other</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_other_col_name</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))]</span> <span class="o">=</span> <span class="n">other</span> |
| |
| <span class="c1"># the above logic makes the Spark DataFrame look like below:</span> |
| <span class="c1"># +-----------------+---+---+------------------+-------------------+------------------+--...</span> |
| <span class="c1"># |__index_level_0__| A| B|__tmp_cond_col_A__|__tmp_other_col_A__|__tmp_cond_col_B__|__...</span> |
| <span class="c1"># +-----------------+---+---+------------------+-------------------+------------------+--...</span> |
| <span class="c1"># | 0| 0|100| true| 0| false| ...</span> |
| <span class="c1"># | 1| 1|200| false| -1| false| ...</span> |
| <span class="c1"># | 3| 3|400| true| -3| false| ...</span> |
| <span class="c1"># | 2| 2|300| false| -2| true| ...</span> |
| <span class="c1"># | 4| 4|500| false| -4| false| ...</span> |
| <span class="c1"># +-----------------+---+---+------------------+-------------------+------------------+--...</span> |
| |
| <span class="n">data_spark_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">data_spark_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">tmp_cond_col_name</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">tmp_other_col_name</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="n">data_spark_columns</span><span class="p">,</span> <span class="n">column_labels</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="c1"># TODO: dtypes?</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.mask"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.mask.html#pyspark.pandas.DataFrame.mask">[docs]</a> <span class="k">def</span> <span class="nf">mask</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">cond</span><span class="p">:</span> <span class="n">DataFrameOrSeries</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrameOrSeries</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Replace values where the condition is True.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cond : boolean DataFrame or Series</span> |
| <span class="sd"> Where cond is False, keep the original value. Where True,</span> |
| <span class="sd"> replace with corresponding value from other.</span> |
| <span class="sd"> other : scalar, DataFrame or Series, default NaN</span> |
| <span class="sd"> Entries where cond is True are replaced with corresponding value from other.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If cond is neither a DataFrame nor a Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> df1 = ps.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]})</span> |
| <span class="sd"> >>> df2 = ps.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]})</span> |
| <span class="sd"> >>> df1</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 100</span> |
| <span class="sd"> 1 1 200</span> |
| <span class="sd"> 2 2 300</span> |
| <span class="sd"> 3 3 400</span> |
| <span class="sd"> 4 4 500</span> |
| <span class="sd"> >>> df2</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 -100</span> |
| <span class="sd"> 1 -1 -200</span> |
| <span class="sd"> 2 -2 -300</span> |
| <span class="sd"> 3 -3 -400</span> |
| <span class="sd"> 4 -4 -500</span> |
| |
| <span class="sd"> >>> df1.mask(df1 > 0).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0.0 NaN</span> |
| <span class="sd"> 1 NaN NaN</span> |
| <span class="sd"> 2 NaN NaN</span> |
| <span class="sd"> 3 NaN NaN</span> |
| <span class="sd"> 4 NaN NaN</span> |
| |
| <span class="sd"> >>> df1.mask(df1 > 1, 10).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 10</span> |
| <span class="sd"> 1 1 10</span> |
| <span class="sd"> 2 10 10</span> |
| <span class="sd"> 3 10 10</span> |
| <span class="sd"> 4 10 10</span> |
| |
| <span class="sd"> >>> df1.mask(df1 > 1, df1 + 100).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 200</span> |
| <span class="sd"> 1 1 300</span> |
| <span class="sd"> 2 102 400</span> |
| <span class="sd"> 3 103 500</span> |
| <span class="sd"> 4 104 600</span> |
| |
| <span class="sd"> >>> df1.mask(df1 > 1, df2).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 0 -100</span> |
| <span class="sd"> 1 1 -200</span> |
| <span class="sd"> 2 -2 -300</span> |
| <span class="sd"> 3 -3 -400</span> |
| <span class="sd"> 4 -4 -500</span> |
| |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"type of cond must be a DataFrame or Series"</span><span class="p">)</span> |
| |
| <span class="c1"># mask(cond) is where(~cond): invert the condition, then delegate to where().</span> |
| <span class="n">cond_inversed</span> <span class="o">=</span> <span class="n">cond</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="o">~</span><span class="n">psser</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">cond_inversed</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span></div> |
| |
| <!-- Rendered source of the DataFrame.index property: it imports Index inside the body and returns Index._new_instance(self). NOTE(review): the local import is presumably there to avoid a circular import; confirm against the upstream module. --><span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Index"</span><span class="p">:</span> |
| <span class="sd">"""The index (row labels) Column of the DataFrame.</span> |
| |
| <span class="sd"> Currently not supported when the DataFrame has no index.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Index</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.base</span> <span class="kn">import</span> <span class="n">Index</span> |
| |
| <span class="k">return</span> <span class="n">Index</span><span class="o">.</span><span class="n">_new_instance</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <!-- Rendered source of the DataFrame.empty property: it evaluates to True when the frame has no column labels, or when isEmpty() on the resolved Spark frame's RDD is true. --><span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">empty</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns true if the current DataFrame is empty. Otherwise, returns false.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.range(10).empty</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> >>> ps.range(0).empty</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> >>> ps.DataFrame({}, index=list('abc')).empty</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">(</span> |
| <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><!-- evaluated first: the "or" short-circuits, so the isEmpty() call below is skipped when there are no columns --> |
| <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">isEmpty</span><span class="p">()</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">style</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Styler"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Property returning a Styler object containing methods for</span> |
| <span class="sd"> building a styled HTML representation for the DataFrame.</span> |
| |
| <span class="sd"> .. note:: currently it collects at most ``compute.max_rows`` rows (1000 by default)</span> |
| <span class="sd"> and returns a pandas `pandas.io.formats.style.Styler` instance built from them.</span> |
| <span class="sd"> A ``UserWarning`` is emitted when the DataFrame has more rows than that limit.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.range(1001).style # doctest: +SKIP</span> |
| <span class="sd"> <pandas.io.formats.style.Styler object at ...></span> |
| <span class="sd"> """</span> |
| <span class="n">max_results</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.max_rows"</span><span class="p">)</span> |
| <span class="c1"># Fetch one extra row so that truncation can be detected and reported below.</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">max_results</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">></span> <span class="n">max_results</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"'style' property will only use top </span><span class="si">%s</span><span class="s2"> rows."</span> <span class="o">%</span> <span class="n">max_results</span><span class="p">,</span> <span class="ne">UserWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">max_results</span><span class="p">)</span><span class="o">.</span><span class="n">style</span> |
| |
| <div class="viewcode-block" id="DataFrame.set_index"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.set_index.html#pyspark.pandas.DataFrame.set_index">[docs]</a> <span class="k">def</span> <span class="nf">set_index</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">keys</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]],</span> |
| <span class="n">drop</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">append</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""Set the DataFrame index (row labels) using one or more existing columns.</span> |
| |
| <span class="sd"> Set the DataFrame index (row labels) using one or more existing</span> |
| <span class="sd"> columns or arrays (of the correct length). The index can replace the</span> |
| <span class="sd"> existing index or expand on it.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> keys : label or array-like or list of labels/arrays</span> |
| <span class="sd"> This parameter can be either a single column key, a single array of</span> |
| <span class="sd"> the same length as the calling DataFrame, or a list containing an</span> |
| <span class="sd"> arbitrary combination of column keys and arrays. Here, "array"</span> |
| <span class="sd"> encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.</span> |
| <span class="sd"> drop : bool, default True</span> |
| <span class="sd"> Delete columns to be used as the new index.</span> |
| <span class="sd"> append : bool, default False</span> |
| <span class="sd"> Whether to append columns to existing index.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> Modify the DataFrame in place (do not create a new object).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Changed row labels.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.reset_index : Opposite of set_index.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'month': [1, 4, 7, 10],</span> |
| <span class="sd"> ... 'year': [2012, 2014, 2013, 2014],</span> |
| <span class="sd"> ... 'sale': [55, 40, 84, 31]},</span> |
| <span class="sd"> ... columns=['month', 'year', 'sale'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> month year sale</span> |
| <span class="sd"> 0 1 2012 55</span> |
| <span class="sd"> 1 4 2014 40</span> |
| <span class="sd"> 2 7 2013 84</span> |
| <span class="sd"> 3 10 2014 31</span> |
| |
| <span class="sd"> Set the index to become the 'month' column:</span> |
| |
| <span class="sd"> >>> df.set_index('month') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> year sale</span> |
| <span class="sd"> month</span> |
| <span class="sd"> 1 2012 55</span> |
| <span class="sd"> 4 2014 40</span> |
| <span class="sd"> 7 2013 84</span> |
| <span class="sd"> 10 2014 31</span> |
| |
| <span class="sd"> Create a MultiIndex using columns 'year' and 'month':</span> |
| |
| <span class="sd"> >>> df.set_index(['year', 'month']) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> sale</span> |
| <span class="sd"> year month</span> |
| <span class="sd"> 2012 1 55</span> |
| <span class="sd"> 2014 4 40</span> |
| <span class="sd"> 2013 7 84</span> |
| <span class="sd"> 2014 10 31</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">keys</span><span class="p">):</span> |
| <span class="n">key_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">keys</span><span class="p">)]</span> <span class="c1"># type: List[Label]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">keys</span><span class="p">):</span> |
| <span class="n">key_list</span> <span class="o">=</span> <span class="p">[(</span><span class="n">keys</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">key_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">key</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">key</span><span class="p">,)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">keys</span><span class="p">]</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">key_list</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">key</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">key</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="n">drop</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">key_list</span> |
| <span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">append</span><span class="p">:</span> |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="p">[</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">key_list</span> |
| <span class="p">]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> <span class="o">+</span> <span class="n">key_list</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> <span class="o">+</span> <span class="p">[</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">field_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">key_list</span> |
| <span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">key_list</span><span class="p">]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">key_list</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">field_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">key_list</span><span class="p">]</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">field_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.reset_index"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.reset_index.html#pyspark.pandas.DataFrame.reset_index">[docs]</a> <span class="k">def</span> <span class="nf">reset_index</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">drop</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">col_level</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">col_fill</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">""</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""Reset the index, or a level of it.</span> |
| |
| <span class="sd"> For DataFrame with multi-level index, return new DataFrame with labeling information in</span> |
| <span class="sd"> the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any are None.</span> |
| <span class="sd"> For a standard index, the index name will be used (if set), otherwise a default 'index' or</span> |
| <span class="sd"> 'level_0' (if 'index' is already taken) will be used.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> level : int, str, tuple, or list, default None</span> |
| <span class="sd"> Only remove the given levels from the index. Removes all levels by</span> |
| <span class="sd"> default.</span> |
| <span class="sd"> drop : bool, default False</span> |
| <span class="sd"> Do not try to insert index into dataframe columns. This resets</span> |
| <span class="sd"> the index to the default integer index.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> Modify the DataFrame in place (do not create a new object).</span> |
| <span class="sd"> col_level : int, default 0</span> |
| <span class="sd"> If the columns have multiple levels, determines which level the</span> |
| <span class="sd"> labels are inserted into. By default it is inserted into the first</span> |
| <span class="sd"> level.</span> |
| <span class="sd"> col_fill : str, default ''</span> |
| <span class="sd"> If the columns have multiple levels, determines how the other</span> |
| <span class="sd"> levels are named. The value is inserted as-is into every other level.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or None</span> |
| <span class="sd"> DataFrame with the new index, or None if `inplace=True`.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.set_index : Opposite of reset_index.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([('bird', 389.0),</span> |
| <span class="sd"> ... ('bird', 24.0),</span> |
| <span class="sd"> ... ('mammal', 80.5),</span> |
| <span class="sd"> ... ('mammal', np.nan)],</span> |
| <span class="sd"> ... index=['falcon', 'parrot', 'lion', 'monkey'],</span> |
| <span class="sd"> ... columns=('class', 'max_speed'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> class max_speed</span> |
| <span class="sd"> falcon bird 389.0</span> |
| <span class="sd"> parrot bird 24.0</span> |
| <span class="sd"> lion mammal 80.5</span> |
| <span class="sd"> monkey mammal NaN</span> |
| |
| <span class="sd"> When we reset the index, the old index is added as a column. Unlike pandas, pandas-on-Spark</span> |
| <span class="sd"> does not automatically add a sequential index. The following 0, 1, 2, 3 are only</span> |
| <span class="sd"> there when we display the DataFrame.</span> |
| |
| <span class="sd"> >>> df.reset_index()</span> |
| <span class="sd"> index class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 1 parrot bird 24.0</span> |
| <span class="sd"> 2 lion mammal 80.5</span> |
| <span class="sd"> 3 monkey mammal NaN</span> |
| |
| <span class="sd"> We can use the `drop` parameter to avoid the old index being added as</span> |
| <span class="sd"> a column:</span> |
| |
| <span class="sd"> >>> df.reset_index(drop=True)</span> |
| <span class="sd"> class max_speed</span> |
| <span class="sd"> 0 bird 389.0</span> |
| <span class="sd"> 1 bird 24.0</span> |
| <span class="sd"> 2 mammal 80.5</span> |
| <span class="sd"> 3 mammal NaN</span> |
| |
| <span class="sd"> You can also use `reset_index` with `MultiIndex`.</span> |
| |
| <span class="sd"> >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),</span> |
| <span class="sd"> ... ('bird', 'parrot'),</span> |
| <span class="sd"> ... ('mammal', 'lion'),</span> |
| <span class="sd"> ... ('mammal', 'monkey')],</span> |
| <span class="sd"> ... names=['class', 'name'])</span> |
| <span class="sd"> >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),</span> |
| <span class="sd"> ... ('species', 'type')])</span> |
| <span class="sd"> >>> df = ps.DataFrame([(389.0, 'fly'),</span> |
| <span class="sd"> ... ( 24.0, 'fly'),</span> |
| <span class="sd"> ... ( 80.5, 'run'),</span> |
| <span class="sd"> ... (np.nan, 'jump')],</span> |
| <span class="sd"> ... index=index,</span> |
| <span class="sd"> ... columns=columns)</span> |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> speed species</span> |
| <span class="sd"> max type</span> |
| <span class="sd"> class name</span> |
| <span class="sd"> bird falcon 389.0 fly</span> |
| <span class="sd"> parrot 24.0 fly</span> |
| <span class="sd"> mammal lion 80.5 run</span> |
| <span class="sd"> monkey NaN jump</span> |
| |
| <span class="sd"> If the index has multiple levels, we can reset a subset of them:</span> |
| |
| <span class="sd"> >>> df.reset_index(level='class') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> class speed species</span> |
| <span class="sd"> max type</span> |
| <span class="sd"> name</span> |
| <span class="sd"> falcon bird 389.0 fly</span> |
| <span class="sd"> parrot bird 24.0 fly</span> |
| <span class="sd"> lion mammal 80.5 run</span> |
| <span class="sd"> monkey mammal NaN jump</span> |
| |
| <span class="sd"> If we are not dropping the index, by default, it is placed in the top</span> |
| <span class="sd"> level. We can place it in another level:</span> |
| |
| <span class="sd"> >>> df.reset_index(level='class', col_level=1) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> speed species</span> |
| <span class="sd"> class max type</span> |
| <span class="sd"> name</span> |
| <span class="sd"> falcon bird 389.0 fly</span> |
| <span class="sd"> parrot bird 24.0 fly</span> |
| <span class="sd"> lion mammal 80.5 run</span> |
| <span class="sd"> monkey mammal NaN jump</span> |
| |
| <span class="sd"> When the index is inserted under another level, we can specify under</span> |
| <span class="sd"> which one with the parameter `col_fill`:</span> |
| |
| <span class="sd"> >>> df.reset_index(level='class', col_level=1,</span> |
| <span class="sd"> ... col_fill='species') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> species speed species</span> |
| <span class="sd"> class max type</span> |
| <span class="sd"> name</span> |
| <span class="sd"> falcon bird 389.0 fly</span> |
| <span class="sd"> parrot bird 24.0 fly</span> |
| <span class="sd"> lion mammal 80.5 run</span> |
| <span class="sd"> monkey mammal NaN jump</span> |
| |
| <span class="sd"> If we specify a nonexistent level for `col_fill`, it is created:</span> |
| |
| <span class="sd"> >>> df.reset_index(level='class', col_level=1,</span> |
| <span class="sd"> ... col_fill='genus') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> genus speed species</span> |
| <span class="sd"> class max type</span> |
| <span class="sd"> name</span> |
| <span class="sd"> falcon bird 389.0 fly</span> |
| <span class="sd"> parrot bird 24.0 fly</span> |
| <span class="sd"> lion mammal 80.5 run</span> |
| <span class="sd"> monkey mammal NaN jump</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="n">multi_index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">></span> <span class="mi">1</span> |
| |
| <span class="k">def</span> <span class="nf">rename</span><span class="p">(</span><span class="n">index</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Label</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">multi_index</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">(</span><span class="s2">"level_</span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">index</span><span class="p">),)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="p">(</span><span class="s2">"index"</span><span class="p">,)</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">(</span><span class="s2">"index"</span><span class="p">,)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">(</span><span class="s2">"level_</span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">index</span><span class="p">),)</span> |
| |
| <span class="k">if</span> <span class="n">level</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">name</span> <span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">rename</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">new_data_spark_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">new_column_labels</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">new_data_fields</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> |
| |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">level</span><span class="p">):</span> |
| <span class="n">level</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Sequence</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]],</span> <span class="n">level</span><span class="p">))</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">level</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">level</span><span class="p">):</span> |
| <span class="n">level_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Label</span><span class="p">],</span> <span class="n">level</span><span class="p">)]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">level</span><span class="p">):</span> |
| <span class="n">level_list</span> <span class="o">=</span> <span class="p">[(</span><span class="n">level</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">level_list</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">lvl</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">lvl</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">lvl</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">lvl</span><span class="p">,)</span> |
| <span class="k">for</span> <span class="n">lvl</span> <span class="ow">in</span> <span class="n">level</span> |
| <span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">l</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">for</span> <span class="n">l</span> <span class="ow">in</span> <span class="n">level_list</span><span class="p">):</span> |
| <span class="n">int_level_list</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">level_list</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">lev</span> <span class="ow">in</span> <span class="n">int_level_list</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">lev</span> <span class="o">>=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">IndexError</span><span class="p">(</span> |
| <span class="s2">"Too many levels: Index has only </span><span class="si">{}</span><span class="s2"> level, not </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">,</span> <span class="n">lev</span> <span class="o">+</span> <span class="mi">1</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">idx</span> <span class="o">=</span> <span class="n">int_level_list</span> |
| <span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">lev</span><span class="p">)</span> <span class="k">for</span> <span class="n">lev</span> <span class="ow">in</span> <span class="n">level_list</span><span class="p">):</span> |
| <span class="n">idx</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">l</span> <span class="ow">in</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">level_list</span><span class="p">):</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">i</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="n">l</span><span class="p">)</span> |
| <span class="n">idx</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="k">except</span> <span class="ne">ValueError</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">multi_index</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">"Level unknown not found"</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"Level unknown must be same as name (</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">name_like_string</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Level should be all int or all string."</span><span class="p">)</span> |
| <span class="n">idx</span><span class="o">.</span><span class="n">sort</span><span class="p">()</span> |
| |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">new_data_spark_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">new_data_fields</span> <span class="o">=</span> <span class="p">[]</span> |
| |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">idx</span><span class="p">[::</span><span class="o">-</span><span class="mi">1</span><span class="p">]:</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">index_names</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="n">new_column_labels</span><span class="o">.</span><span class="n">insert</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">name</span> <span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">rename</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">index_spark_columns</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="n">new_data_spark_columns</span><span class="o">.</span><span class="n">insert</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">name</span><span class="p">)))</span> |
| |
| <span class="n">new_data_fields</span><span class="o">.</span><span class="n">insert</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">name</span><span class="p">)))</span> |
| |
| <span class="k">if</span> <span class="n">drop</span><span class="p">:</span> |
| <span class="n">new_data_spark_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">new_data_fields</span> <span class="o">=</span> <span class="p">[]</span> |
| |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">new_column_labels</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"cannot insert </span><span class="si">{}</span><span class="s2">, already exists"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">column_depth</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">if</span> <span class="n">col_level</span> <span class="o">>=</span> <span class="n">column_depth</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">IndexError</span><span class="p">(</span> |
| <span class="s2">"Too many levels: Index has only </span><span class="si">{}</span><span class="s2"> levels, not </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">column_depth</span><span class="p">,</span> <span class="n">col_level</span> <span class="o">+</span> <span class="mi">1</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="n">col_level</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">></span> <span class="n">column_depth</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">new_column_labels</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Item must have length equal to number of levels."</span><span class="p">)</span> |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="nb">tuple</span><span class="p">(</span> |
| <span class="p">([</span><span class="n">col_fill</span><span class="p">]</span> <span class="o">*</span> <span class="n">col_level</span><span class="p">)</span> |
| <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="o">+</span> <span class="p">([</span><span class="n">col_fill</span><span class="p">]</span> <span class="o">*</span> <span class="p">(</span><span class="n">column_depth</span> <span class="o">-</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="n">col_level</span><span class="p">)))</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">new_column_labels</span> |
| <span class="p">]</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">new_column_labels</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="n">new_data_spark_columns</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">,</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="n">new_data_fields</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.isnull"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.isnull.html#pyspark.pandas.DataFrame.isnull">[docs]</a> <span class="k">def</span> <span class="nf">isnull</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Detects missing values for items in the current DataFrame.</span> |
| |
| <span class="sd"> Return a boolean same-sized DataFrame indicating if the values are NA.</span> |
| <span class="sd"> NA values, such as None or numpy.NaN, get mapped to True values.</span> |
| <span class="sd"> Everything else gets mapped to False values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.notnull</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)])</span> |
| <span class="sd"> >>> df.isnull()</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 0 False False</span> |
| <span class="sd"> 1 False True</span> |
| <span class="sd"> 2 False True</span> |
| <span class="sd"> 3 False False</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([[None, 'bee', None], ['dog', None, 'fly']])</span> |
| <span class="sd"> >>> df.isnull()</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 0 True False True</span> |
| <span class="sd"> 1 False True False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">isnull</span><span class="p">())</span></div> |
| |
| <span class="n">isna</span> <span class="o">=</span> <span class="n">isnull</span> |
| |
| <div class="viewcode-block" id="DataFrame.notnull"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.notnull.html#pyspark.pandas.DataFrame.notnull">[docs]</a> <span class="k">def</span> <span class="nf">notnull</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Detects non-missing values for items in the current DataFrame.</span> |
| |
| <span class="sd"> This function takes a dataframe and indicates whether its</span> |
| <span class="sd"> values are valid (not missing, which is ``NaN`` in numeric</span> |
| <span class="sd"> datatypes, ``None`` or ``NaN`` in objects and ``NaT`` in datetimelike).</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.isnull</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)])</span> |
| <span class="sd"> >>> df.notnull()</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 0 True True</span> |
| <span class="sd"> 1 True False</span> |
| <span class="sd"> 2 True False</span> |
| <span class="sd"> 3 True True</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])</span> |
| <span class="sd"> >>> df.notnull()</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 0 True True True</span> |
| <span class="sd"> 1 True False True</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">notnull</span><span class="p">())</span></div> |
| |
| <span class="n">notna</span> <span class="o">=</span> <span class="n">notnull</span> |
| |
| <div class="viewcode-block" id="DataFrame.insert"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.insert.html#pyspark.pandas.DataFrame.insert">[docs]</a> <span class="k">def</span> <span class="nf">insert</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">loc</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> |
| <span class="n">column</span><span class="p">:</span> <span class="n">Name</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">],</span> |
| <span class="n">allow_duplicates</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Insert column into DataFrame at specified location.</span> |
| |
| <span class="sd"> Raises a ValueError if `column` is already contained in the DataFrame,</span> |
| <span class="sd"> unless `allow_duplicates` is set to True.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> loc : int</span> |
| <span class="sd"> Insertion index. Must satisfy 0 &lt;= loc &lt;= len(columns).</span> |
| <span class="sd"> column : str, number, or hashable object</span> |
| <span class="sd"> Label of the inserted column.</span> |
| <span class="sd"> value : int, Series, or array-like</span> |
| <span class="sd"> allow_duplicates : bool, optional</span> |
| <span class="sd"> Only ``False`` is supported at the moment.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame([1, 2, 3])</span> |
| <span class="sd"> >>> psdf.sort_index()</span> |
| <span class="sd"> 0</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> >>> psdf.insert(0, 'x', 4)</span> |
| <span class="sd"> >>> psdf.sort_index()</span> |
| <span class="sd"> x 0</span> |
| <span class="sd"> 0 4 1</span> |
| <span class="sd"> 1 4 2</span> |
| <span class="sd"> 2 4 3</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| |
| <span class="sd"> >>> psdf.insert(1, 'y', [5, 6, 7])</span> |
| <span class="sd"> >>> psdf.sort_index()</span> |
| <span class="sd"> x y 0</span> |
| <span class="sd"> 0 4 5 1</span> |
| <span class="sd"> 1 4 6 2</span> |
| <span class="sd"> 2 4 7 3</span> |
| |
| <span class="sd"> >>> psdf.insert(2, 'z', ps.Series([8, 9, 10]))</span> |
| <span class="sd"> >>> psdf.sort_index()</span> |
| <span class="sd"> x y z 0</span> |
| <span class="sd"> 0 4 5 8 1</span> |
| <span class="sd"> 1 4 6 9 2</span> |
| <span class="sd"> 2 4 7 10 3</span> |
| |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">loc</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"loc must be int"</span><span class="p">)</span> |
| |
| <span class="k">assert</span> <span class="mi">0</span> <span class="o">&lt;=</span> <span class="n">loc</span> <span class="o">&lt;=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> |
| <span class="k">assert</span> <span class="n">allow_duplicates</span> <span class="ow">is</span> <span class="kc">False</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">column</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s1">'"column" should be a scalar value or tuple that contains scalar values'</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">column</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">levels</span><span class="p">):</span> |
| <span class="c1"># To be consistent with pandas</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s1">'"column" must have length equal to number of column levels.'</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"cannot insert </span><span class="si">%s</span><span class="s2">, already exists"</span> <span class="o">%</span> <span class="n">column</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">column</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">insert</span><span class="p">(</span><span class="n">loc</span><span class="p">,</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">columns</span><span class="p">]</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: add freq and axis parameters</span> |
| <div class="viewcode-block" id="DataFrame.shift"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.shift.html#pyspark.pandas.DataFrame.shift">[docs]</a> <span class="k">def</span> <span class="nf">shift</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Shift DataFrame by desired number of periods.</span> |
| |
| <span class="sd"> .. note:: the current implementation of shift uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This moves all data into a</span> |
| <span class="sd"> single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid using this method on very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : int</span> |
| <span class="sd"> Number of periods to shift. Can be positive or negative.</span> |
| <span class="sd"> fill_value : object, optional</span> |
| <span class="sd"> The scalar value to use for newly introduced missing values.</span> |
| <span class="sd"> The default depends on the dtype of self. For numeric data, np.nan is used.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Copy of input DataFrame, shifted.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'Col1': [10, 20, 15, 30, 45],</span> |
| <span class="sd"> ... 'Col2': [13, 23, 18, 33, 48],</span> |
| <span class="sd"> ... 'Col3': [17, 27, 22, 37, 52]},</span> |
| <span class="sd"> ... columns=['Col1', 'Col2', 'Col3'])</span> |
| |
| <span class="sd"> >>> df.shift(periods=3)</span> |
| <span class="sd"> Col1 Col2 Col3</span> |
| <span class="sd"> 0 NaN NaN NaN</span> |
| <span class="sd"> 1 NaN NaN NaN</span> |
| <span class="sd"> 2 NaN NaN NaN</span> |
| <span class="sd"> 3 10.0 13.0 17.0</span> |
| <span class="sd"> 4 20.0 23.0 27.0</span> |
| |
| <span class="sd"> >>> df.shift(periods=3, fill_value=0)</span> |
| <span class="sd"> Col1 Col2 Col3</span> |
| <span class="sd"> 0 0 0 0</span> |
| <span class="sd"> 1 0 0 0</span> |
| <span class="sd"> 2 0 0 0</span> |
| <span class="sd"> 3 10 13 17</span> |
| <span class="sd"> 4 20 23 27</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_shift</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: axis should also support 1 or 'columns'; only 0 or 'index' works at the moment</span> |
| <div class="viewcode-block" id="DataFrame.diff"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.diff.html#pyspark.pandas.DataFrame.diff">[docs]</a> <span class="k">def</span> <span class="nf">diff</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> First discrete difference of element.</span> |
| |
| <span class="sd"> Calculates the difference of a DataFrame element compared with another element in the</span> |
| <span class="sd"> DataFrame (default is the element in the same column of the previous row).</span> |
| |
| <span class="sd"> .. note:: the current implementation of diff uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This moves all data into a</span> |
| <span class="sd"> single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid using this method on very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : int, default 1</span> |
| <span class="sd"> Periods to shift for calculating difference, accepts negative values.</span> |
| <span class="sd"> axis : int, default 0 or 'index'</span> |
| <span class="sd"> Can only be set to 0 at the moment.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> diffed : DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4, 5, 6],</span> |
| <span class="sd"> ... 'b': [1, 1, 2, 3, 5, 8],</span> |
| <span class="sd"> ... 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 1 1</span> |
| <span class="sd"> 1 2 1 4</span> |
| <span class="sd"> 2 3 2 9</span> |
| <span class="sd"> 3 4 3 16</span> |
| <span class="sd"> 4 5 5 25</span> |
| <span class="sd"> 5 6 8 36</span> |
| |
| <span class="sd"> >>> df.diff()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 NaN NaN NaN</span> |
| <span class="sd"> 1 1.0 0.0 3.0</span> |
| <span class="sd"> 2 1.0 1.0 5.0</span> |
| <span class="sd"> 3 1.0 1.0 7.0</span> |
| <span class="sd"> 4 1.0 2.0 9.0</span> |
| <span class="sd"> 5 1.0 3.0 11.0</span> |
| |
| <span class="sd"> Difference with 3rd previous row</span> |
| |
| <span class="sd"> >>> df.diff(periods=3)</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 NaN NaN NaN</span> |
| <span class="sd"> 1 NaN NaN NaN</span> |
| <span class="sd"> 2 NaN NaN NaN</span> |
| <span class="sd"> 3 3.0 2.0 15.0</span> |
| <span class="sd"> 4 3.0 4.0 21.0</span> |
| <span class="sd"> 5 3.0 6.0 27.0</span> |
| |
| <span class="sd"> Difference with following row</span> |
| |
| <span class="sd"> >>> df.diff(periods=-1)</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 -1.0 0.0 -3.0</span> |
| <span class="sd"> 1 -1.0 -1.0 -5.0</span> |
| <span class="sd"> 2 -1.0 -1.0 -7.0</span> |
| <span class="sd"> 3 -1.0 -2.0 -9.0</span> |
| <span class="sd"> 4 -1.0 -3.0 -11.0</span> |
| <span class="sd"> 5 NaN NaN NaN</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_diff</span><span class="p">(</span><span class="n">periods</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: axis should also support 1 or 'columns'; only 0 or 'index' works at the moment</span> |
| <div class="viewcode-block" id="DataFrame.nunique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.nunique.html#pyspark.pandas.DataFrame.nunique">[docs]</a> <span class="k">def</span> <span class="nf">nunique</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">approx</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">rsd</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return number of unique elements in the object.</span> |
| |
| <span class="sd"> Excludes NA values by default.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : int, default 0 or 'index'</span> |
| <span class="sd"> Can only be set to 0 at the moment.</span> |
| <span class="sd"> dropna : bool, default True</span> |
| <span class="sd"> Don't include NaN in the count.</span> |
| <span class="sd"> approx : bool, default False</span> |
| <span class="sd"> If False, will use the exact algorithm and return the exact number of unique values.</span> |
| <span class="sd"> If True, it uses the HyperLogLog approximate algorithm, which is significantly faster</span> |
| <span class="sd"> for large amounts of data.</span> |
| <span class="sd"> Note: This parameter is specific to pandas-on-Spark and is not found in pandas.</span> |
| <span class="sd"> rsd : float, default 0.05</span> |
| <span class="sd"> Maximum estimation error allowed in the HyperLogLog algorithm.</span> |
| <span class="sd"> Note: Just like ``approx`` this parameter is specific to pandas-on-Spark.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> The number of unique values per column as a pandas-on-Spark Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [np.nan, 3, np.nan]})</span> |
| <span class="sd"> >>> df.nunique()</span> |
| <span class="sd"> A 3</span> |
| <span class="sd"> B 1</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> df.nunique(dropna=False)</span> |
| <span class="sd"> A 3</span> |
| <span class="sd"> B 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> On big data, we recommend using the approximate algorithm to speed up this function.</span> |
| <span class="sd"> The result will be very close to the exact unique count.</span> |
| |
| <span class="sd"> >>> df.nunique(approx=True)</span> |
| <span class="sd"> A 3</span> |
| <span class="sd"> B 1</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">StringType</span><span class="p">())</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">)]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">_nunique</span><span class="p">(</span><span class="n">dropna</span><span class="p">,</span> <span class="n">approx</span><span class="p">,</span> <span class="n">rsd</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># The data is expected to be small so it's fine to transpose/use default index.</span> |
| <span class="k">with</span> <span class="n">ps</span><span class="o">.</span><span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.max_rows"</span><span class="p">,</span> <span class="mi">1</span><span class="p">):</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">)],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">())</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.round"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.round.html#pyspark.pandas.DataFrame.round">[docs]</a> <span class="k">def</span> <span class="nf">round</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">decimals</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="s2">"Series"</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Round a DataFrame to a variable number of decimal places.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> decimals : int, dict, Series</span> |
| <span class="sd"> Number of decimal places to round each column to. If an int is</span> |
| <span class="sd"> given, round each column to the same number of places.</span> |
| <span class="sd"> Otherwise dict and Series round to variable numbers of places.</span> |
| <span class="sd"> Column names should be in the keys if `decimals` is a</span> |
| <span class="sd"> dict-like, or in the index if `decimals` is a Series. Any</span> |
| <span class="sd"> columns not included in `decimals` will be left as is. Elements</span> |
| <span class="sd"> of `decimals` which are not columns of the input will be</span> |
| <span class="sd"> ignored.</span> |
| |
| <span class="sd"> .. note:: If `decimals` is a Series, it is expected to be small,</span> |
| <span class="sd"> as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.round</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A':[0.028208, 0.038683, 0.877076],</span> |
| <span class="sd"> ... 'B':[0.992815, 0.645646, 0.149370],</span> |
| <span class="sd"> ... 'C':[0.173891, 0.577595, 0.491027]},</span> |
| <span class="sd"> ... columns=['A', 'B', 'C'],</span> |
| <span class="sd"> ... index=['first', 'second', 'third'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> first 0.028208 0.992815 0.173891</span> |
| <span class="sd"> second 0.038683 0.645646 0.577595</span> |
| <span class="sd"> third 0.877076 0.149370 0.491027</span> |
| |
| <span class="sd"> >>> df.round(2)</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> first 0.03 0.99 0.17</span> |
| <span class="sd"> second 0.04 0.65 0.58</span> |
| <span class="sd"> third 0.88 0.15 0.49</span> |
| |
| <span class="sd"> >>> df.round({'A': 1, 'C': 2})</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> first 0.0 0.992815 0.17</span> |
| <span class="sd"> second 0.0 0.645646 0.58</span> |
| <span class="sd"> third 0.9 0.149370 0.49</span> |
| |
| <span class="sd"> >>> decimals = ps.Series([1, 0, 2], index=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> df.round(decimals)</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> first 0.0 1.0 0.17</span> |
| <span class="sd"> second 0.0 1.0 0.58</span> |
| <span class="sd"> third 0.9 0.0 0.49</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">decimals</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">decimals_dict</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="n">k</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">k</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">k</span><span class="p">,):</span> <span class="n">v</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">decimals</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">}</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">decimals</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="n">decimals_dict</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">k</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">k</span><span class="p">,):</span> <span class="n">v</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">decimals</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">decimals</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="n">decimals_dict</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">decimals</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">}</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"decimals must be an integer, a dict-like or a Series"</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">op</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">Column</span><span class="p">]:</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">decimals_dict</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">round</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">decimals_dict</span><span class="p">[</span><span class="n">label</span><span class="p">])</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="n">op</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_mark_duplicates</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">keep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"first"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">SparkDataFrame</span><span class="p">,</span> <span class="nb">str</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="n">subset</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">subset_list</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">subset</span><span class="p">):</span> |
| <span class="n">subset_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">subset</span><span class="p">)]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">subset</span><span class="p">):</span> |
| <span class="n">subset_list</span> <span class="o">=</span> <span class="p">[(</span><span class="n">subset</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">subset_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">sub</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">sub</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">sub</span><span class="p">,)</span> <span class="k">for</span> <span class="n">sub</span> <span class="ow">in</span> <span class="n">subset</span><span class="p">]</span> |
| <span class="n">diff</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">subset_list</span><span class="p">)</span><span class="o">.</span><span class="n">difference</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">))</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">diff</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">d</span><span class="p">)</span> <span class="k">for</span> <span class="n">d</span> <span class="ow">in</span> <span class="n">diff</span><span class="p">]))</span> |
| <span class="n">group_cols</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">subset_list</span><span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span> |
| |
| <span class="n">column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__duplicated__"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">keep</span> <span class="o">==</span> <span class="s2">"first"</span> <span class="ow">or</span> <span class="n">keep</span> <span class="o">==</span> <span class="s2">"last"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">keep</span> <span class="o">==</span> <span class="s2">"first"</span><span class="p">:</span> |
| <span class="n">ord_func</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">asc</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">ord_func</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">desc</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">group_cols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">ord_func</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">currentRow</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="ow">not</span> <span class="n">keep</span><span class="p">:</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">group_cols</span><span class="p">)</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">unboundedFollowing</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="s2">"*"</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"'keep' only supports 'first', 'last' and False"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">sdf</span><span class="p">,</span> <span class="n">column</span> |
| |
| <div class="viewcode-block" id="DataFrame.duplicated"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.duplicated.html#pyspark.pandas.DataFrame.duplicated">[docs]</a> <span class="k">def</span> <span class="nf">duplicated</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">keep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"first"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return boolean Series denoting duplicate rows, optionally only considering certain columns.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> subset : column label or sequence of labels, optional</span> |
| <span class="sd"> Only consider certain columns for identifying duplicates,</span> |
| <span class="sd"> by default use all of the columns</span> |
| <span class="sd"> keep : {'first', 'last', False}, default 'first'</span> |
| <span class="sd"> - ``first`` : Mark duplicates as ``True`` except for the first occurrence.</span> |
| <span class="sd"> - ``last`` : Mark duplicates as ``True`` except for the last occurrence.</span> |
| <span class="sd"> - False : Mark all duplicates as ``True``.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> duplicated : Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 3], 'b': [1, 1, 1, 4], 'c': [1, 1, 1, 5]},</span> |
| <span class="sd"> ... columns = ['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 1 1</span> |
| <span class="sd"> 1 1 1 1</span> |
| <span class="sd"> 2 1 1 1</span> |
| <span class="sd"> 3 3 4 5</span> |
| |
| <span class="sd"> >>> df.duplicated().sort_index()</span> |
| <span class="sd"> 0 False</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 True</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> dtype: bool</span> |
| |
| <span class="sd"> Mark duplicates as ``True`` except for the last occurrence.</span> |
| |
| <span class="sd"> >>> df.duplicated(keep='last').sort_index()</span> |
| <span class="sd"> 0 True</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 False</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> dtype: bool</span> |
| |
| <span class="sd"> Mark all duplicates as ``True``.</span> |
| |
| <span class="sd"> >>> df.duplicated(keep=False).sort_index()</span> |
| <span class="sd"> 0 True</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 True</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="n">sdf</span><span class="p">,</span> <span class="n">column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_mark_duplicates</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="n">keep</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="o">+</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">)]</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> <span class="c1"># type: ignore</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: support other as DataFrame or array-like</span> |
| <div class="viewcode-block" id="DataFrame.dot"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.dot.html#pyspark.pandas.DataFrame.dot">[docs]</a> <span class="k">def</span> <span class="nf">dot</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute the matrix multiplication between the DataFrame and other.</span> |
| |
| <span class="sd"> This method computes the matrix product between the DataFrame and the</span> |
| <span class="sd"> values of another Series.</span> |
| |
| <span class="sd"> It can also be called using ``self @ other`` in Python >= 3.5.</span> |
| |
| <span class="sd"> .. note:: This method is based on an expensive operation due to the nature</span> |
| <span class="sd"> of big data. Internally it needs to generate each row for each value, and</span> |
| <span class="sd"> then group twice - it is a huge operation. To prevent misuse, this method</span> |
| <span class="sd"> limits the input length to 'compute.max_rows' by default and raises a</span> |
| <span class="sd"> ValueError when the input exceeds that limit.</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import option_context</span> |
| <span class="sd"> >>> with option_context(</span> |
| <span class="sd"> ... 'compute.max_rows', 1000, "compute.ops_on_diff_frames", True</span> |
| <span class="sd"> ... ): # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> ... psdf = ps.DataFrame({'a': range(1001)})</span> |
| <span class="sd"> ... psser = ps.Series([2], index=['a'])</span> |
| <span class="sd"> ... psdf.dot(psser)</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: Current DataFrame has more then the given limit 1000 rows.</span> |
| <span class="sd"> Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option'</span> |
| <span class="sd"> to retrieve to retrieve more than 1000 rows. Note that, before changing the</span> |
| <span class="sd"> 'compute.max_rows', this operation is considerably expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : Series</span> |
| <span class="sd"> The other object to compute the matrix product with.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Return the matrix product between self and other as a Series.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.dot: Similar method for Series.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The dimensions of DataFrame and other must be compatible in order to</span> |
| <span class="sd"> compute the matrix multiplication. In addition, the column names of</span> |
| <span class="sd"> DataFrame and the index of other must contain the same values, as they</span> |
| <span class="sd"> will be aligned prior to the multiplication.</span> |
| |
| <span class="sd"> The dot method for Series computes the inner product, instead of the</span> |
| <span class="sd"> matrix product here.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.pandas.config import set_option, reset_option</span> |
| <span class="sd"> >>> set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> psdf = ps.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])</span> |
| <span class="sd"> >>> psser = ps.Series([1, 1, 2, 1])</span> |
| <span class="sd"> >>> psdf.dot(psser)</span> |
| <span class="sd"> 0 -4</span> |
| <span class="sd"> 1 5</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Note how shuffling of the objects does not change the result.</span> |
| |
| <span class="sd"> >>> psser2 = psser.reindex([1, 0, 2, 3])</span> |
| <span class="sd"> >>> psdf.dot(psser2)</span> |
| <span class="sd"> 0 -4</span> |
| <span class="sd"> 1 5</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> >>> psdf @ psser2</span> |
| <span class="sd"> 0 -4</span> |
| <span class="sd"> 1 5</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> >>> reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Unsupported type </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">other</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">transpose</span><span class="p">()))</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="fm">__matmul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Matrix multiplication using binary `@` operator in Python>=3.5.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dot</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.to_table"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_table.html#pyspark.pandas.DataFrame.to_table">[docs]</a> <span class="k">def</span> <span class="nf">to_table</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"w"</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">to_table</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="nb">format</span><span class="p">,</span> <span class="n">mode</span><span class="p">,</span> <span class="n">partition_cols</span><span class="p">,</span> <span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div> |
| |
| <span class="n">to_table</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">SparkFrameMethods</span><span class="o">.</span><span class="n">to_table</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrame.to_delta"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_delta.html#pyspark.pandas.DataFrame.to_delta">[docs]</a> <span class="k">def</span> <span class="nf">to_delta</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"w"</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="s2">"OptionalPrimitiveType"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Write the DataFrame out as a Delta Lake table.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : str, required</span> |
| <span class="sd"> Path to write to.</span> |
| <span class="sd"> mode : str</span> |
| <span class="sd"> Python write mode, default 'w'.</span> |
| |
| <span class="sd"> .. note:: mode can accept the strings for Spark writing mode,</span> |
| <span class="sd"> such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.</span> |
| |
| <span class="sd"> - 'append' (equivalent to 'a'): Append the new data to existing data.</span> |
| <span class="sd"> - 'overwrite' (equivalent to 'w'): Overwrite existing data.</span> |
| <span class="sd"> - 'ignore': Silently ignore this operation if data already exists.</span> |
| <span class="sd"> - 'error' or 'errorifexists': Throw an exception if data already exists.</span> |
| |
| <span class="sd"> partition_cols : str or list of str, optional, default None</span> |
| <span class="sd"> Names of partitioning columns</span> |
| <span class="sd"> index_col : str or list of str, optional, default None</span> |
| <span class="sd"> Column names to be used in Spark to represent pandas-on-Spark's index. The index name</span> |
| <span class="sd"> in pandas-on-Spark is ignored. By default, the index is always lost.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Delta Lake.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_delta</span> |
| <span class="sd"> DataFrame.to_parquet</span> |
| <span class="sd"> DataFrame.to_table</span> |
| <span class="sd"> DataFrame.to_spark_io</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame(dict(</span> |
| <span class="sd"> ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')),</span> |
| <span class="sd"> ... country=['KR', 'US', 'JP'],</span> |
| <span class="sd"> ... code=[1, 2 ,3]), columns=['date', 'country', 'code'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> date country code</span> |
| <span class="sd"> 0 2012-01-31 12:00:00 KR 1</span> |
| <span class="sd"> 1 2012-02-29 12:00:00 US 2</span> |
| <span class="sd"> 2 2012-03-31 12:00:00 JP 3</span> |
| |
| <span class="sd"> Create a new Delta Lake table, partitioned by one column:</span> |
| |
| <span class="sd"> >>> df.to_delta('%s/to_delta/foo' % path, partition_cols='date') # doctest: +SKIP</span> |
| |
| <span class="sd"> Partitioned by two columns:</span> |
| |
| <span class="sd"> >>> df.to_delta('%s/to_delta/bar' % path,</span> |
| <span class="sd"> ... partition_cols=['date', 'country']) # doctest: +SKIP</span> |
| |
| <span class="sd"> Overwrite an existing table's partitions, using the 'replaceWhere' capability in Delta:</span> |
| |
| <span class="sd"> >>> df.to_delta('%s/to_delta/bar' % path,</span> |
| <span class="sd"> ... mode='overwrite', replaceWhere='date >= "2012-01-01"') # doctest: +SKIP</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> <span class="c1"># type: ignore</span> |
| |
| <span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">to_spark_io</span><span class="p">(</span> |
| <span class="n">path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> |
| <span class="n">mode</span><span class="o">=</span><span class="n">mode</span><span class="p">,</span> |
| <span class="nb">format</span><span class="o">=</span><span class="s2">"delta"</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="o">=</span><span class="n">partition_cols</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_parquet"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_parquet.html#pyspark.pandas.DataFrame.to_parquet">[docs]</a> <span class="k">def</span> <span class="nf">to_parquet</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"w"</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">compression</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Write the DataFrame out as a Parquet file or directory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : str, required</span> |
| <span class="sd"> Path to write to.</span> |
| <span class="sd"> mode : str</span> |
| <span class="sd"> Python write mode, default 'w'.</span> |
| |
| <span class="sd"> .. note:: mode can also accept the Spark writing mode strings,</span> |
| <span class="sd"> such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.</span> |
| |
| <span class="sd"> - 'append' (equivalent to 'a'): Append the new data to existing data.</span> |
| <span class="sd"> - 'overwrite' (equivalent to 'w'): Overwrite existing data.</span> |
| <span class="sd"> - 'ignore': Silently ignore this operation if data already exists.</span> |
| <span class="sd"> - 'error' or 'errorifexists': Throw an exception if data already exists.</span> |
| |
| <span class="sd"> partition_cols : str or list of str, optional, default None</span> |
| <span class="sd"> Names of partitioning columns</span> |
| <span class="sd"> compression : str {'none', 'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'lz4', 'zstd'}</span> |
| <span class="sd"> Compression codec to use when saving to file. If None is set, it uses the</span> |
| <span class="sd"> value specified in `spark.sql.parquet.compression.codec`.</span> |
| <span class="sd"> index_col : str or list of str, optional, default None</span> |
| <span class="sd"> Column names to be used in Spark to represent pandas-on-Spark's index. The index name</span> |
| <span class="sd"> in pandas-on-Spark is ignored. By default, the index is always lost.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's data source.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_parquet</span> |
| <span class="sd"> DataFrame.to_delta</span> |
| <span class="sd"> DataFrame.to_table</span> |
| <span class="sd"> DataFrame.to_spark_io</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(dict(</span> |
| <span class="sd"> ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')),</span> |
| <span class="sd"> ... country=['KR', 'US', 'JP'],</span> |
| <span class="sd"> ... code=[1, 2 ,3]), columns=['date', 'country', 'code'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> date country code</span> |
| <span class="sd"> 0 2012-01-31 12:00:00 KR 1</span> |
| <span class="sd"> 1 2012-02-29 12:00:00 US 2</span> |
| <span class="sd"> 2 2012-03-31 12:00:00 JP 3</span> |
| |
| <span class="sd"> >>> df.to_parquet('%s/to_parquet/foo.parquet' % path, partition_cols='date')</span> |
| |
| <span class="sd"> >>> df.to_parquet(</span> |
| <span class="sd"> ... '%s/to_parquet/foo.parquet' % path,</span> |
| <span class="sd"> ... mode = 'overwrite',</span> |
| <span class="sd"> ... partition_cols=['date', 'country'])</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> <span class="c1"># type: ignore</span> |
| |
| <span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="n">builder</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">partition_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">partition_cols</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">compression</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">"compression"</span><span class="p">,</span> <span class="n">compression</span><span class="p">)</span> |
| <span class="n">builder</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">"parquet"</span><span class="p">)</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">path</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_orc"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_orc.html#pyspark.pandas.DataFrame.to_orc">[docs]</a> <span class="k">def</span> <span class="nf">to_orc</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"w"</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="s2">"OptionalPrimitiveType"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Write the DataFrame out as an ORC file or directory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> path : str, required</span> |
| <span class="sd"> Path to write to.</span> |
| <span class="sd"> mode : str</span> |
| <span class="sd"> Python write mode, default 'w'.</span> |
| |
| <span class="sd"> .. note:: mode can also accept the Spark writing mode strings,</span> |
| <span class="sd"> such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.</span> |
| |
| <span class="sd"> - 'append' (equivalent to 'a'): Append the new data to existing data.</span> |
| <span class="sd"> - 'overwrite' (equivalent to 'w'): Overwrite existing data.</span> |
| <span class="sd"> - 'ignore': Silently ignore this operation if data already exists.</span> |
| <span class="sd"> - 'error' or 'errorifexists': Throw an exception if data already exists.</span> |
| |
| <span class="sd"> partition_cols : str or list of str, optional, default None</span> |
| <span class="sd"> Names of partitioning columns</span> |
| <span class="sd"> index_col : str or list of str, optional, default None</span> |
| <span class="sd"> Column names to be used in Spark to represent pandas-on-Spark's index. The index name</span> |
| <span class="sd"> in pandas-on-Spark is ignored. By default, the index is always lost.</span> |
| <span class="sd"> options : dict</span> |
| <span class="sd"> All other options passed directly into Spark's data source.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> read_orc</span> |
| <span class="sd"> DataFrame.to_delta</span> |
| <span class="sd"> DataFrame.to_parquet</span> |
| <span class="sd"> DataFrame.to_table</span> |
| <span class="sd"> DataFrame.to_spark_io</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(dict(</span> |
| <span class="sd"> ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')),</span> |
| <span class="sd"> ... country=['KR', 'US', 'JP'],</span> |
| <span class="sd"> ... code=[1, 2 ,3]), columns=['date', 'country', 'code'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> date country code</span> |
| <span class="sd"> 0 2012-01-31 12:00:00 KR 1</span> |
| <span class="sd"> 1 2012-02-29 12:00:00 US 2</span> |
| <span class="sd"> 2 2012-03-31 12:00:00 JP 3</span> |
| |
| <span class="sd"> >>> df.to_orc('%s/to_orc/foo.orc' % path, partition_cols='date')</span> |
| |
| <span class="sd"> >>> df.to_orc(</span> |
| <span class="sd"> ... '%s/to_orc/foo.orc' % path,</span> |
| <span class="sd"> ... mode = 'overwrite',</span> |
| <span class="sd"> ... partition_cols=['date', 'country'])</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="s2">"options"</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"options"</span><span class="p">)</span> <span class="c1"># type: ignore</span> |
| |
| <span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">to_spark_io</span><span class="p">(</span> |
| <span class="n">path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> |
| <span class="n">mode</span><span class="o">=</span><span class="n">mode</span><span class="p">,</span> |
| <span class="nb">format</span><span class="o">=</span><span class="s2">"orc"</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="o">=</span><span class="n">partition_cols</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_spark_io"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_spark_io.html#pyspark.pandas.DataFrame.to_spark_io">[docs]</a> <span class="k">def</span> <span class="nf">to_spark_io</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"overwrite"</span><span class="p">,</span> |
| <span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="s2">"OptionalPrimitiveType"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""An alias for :func:`DataFrame.spark.to_spark_io`.</span> |
| <span class="sd"> See :meth:`pyspark.pandas.spark.accessors.SparkFrameMethods.to_spark_io`.</span> |
| |
| <span class="sd"> .. deprecated:: 3.2.0</span> |
| <span class="sd"> Use :func:`DataFrame.spark.to_spark_io` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 3.2, Use DataFrame.spark.to_spark_io instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">to_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="p">,</span> <span class="n">mode</span><span class="p">,</span> <span class="n">partition_cols</span><span class="p">,</span> <span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div> |
| |
| <span class="n">to_spark_io</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">SparkFrameMethods</span><span class="o">.</span><span class="n">to_spark_io</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrame.to_spark"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_spark.html#pyspark.pandas.DataFrame.to_spark">[docs]</a> <span class="k">def</span> <span class="nf">to_spark</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">SparkDataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">frame</span><span class="p">(</span><span class="n">index_col</span><span class="p">)</span></div> |
| |
| <span class="n">to_spark</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">SparkFrameMethods</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrame.to_pandas"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_pandas.html#pyspark.pandas.DataFrame.to_pandas">[docs]</a> <span class="k">def</span> <span class="nf">to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a pandas DataFrame.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting pandas DataFrame is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],</span> |
| <span class="sd"> ... columns=['dogs', 'cats'])</span> |
| <span class="sd"> >>> df.to_pandas()</span> |
| <span class="sd"> dogs cats</span> |
| <span class="sd"> 0 0.2 0.3</span> |
| <span class="sd"> 1 0.0 0.6</span> |
| <span class="sd"> 2 0.6 0.0</span> |
| <span class="sd"> 3 0.2 0.1</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">to_pandas_frame</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.assign"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.assign.html#pyspark.pandas.DataFrame.assign">[docs]</a> <span class="k">def</span> <span class="nf">assign</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Assign new columns to a DataFrame.</span> |
| |
| <span class="sd"> Returns a new object with all original columns in addition to new ones.</span> |
| <span class="sd"> Existing columns that are re-assigned will be overwritten.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> **kwargs : dict of {str: callable, Series or Index}</span> |
| <span class="sd"> The column names are keywords. If the values are</span> |
| <span class="sd"> callable, they are computed on the DataFrame and</span> |
| <span class="sd"> assigned to the new columns. The callable must not</span> |
| <span class="sd"> change the input DataFrame (though pandas-on-Spark doesn't check it).</span> |
| <span class="sd"> If the values are not callable (e.g. a Series or a literal),</span> |
| <span class="sd"> they are simply assigned.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> A new DataFrame with the new columns in addition to</span> |
| <span class="sd"> all the existing columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'temp_c': [17.0, 25.0]},</span> |
| <span class="sd"> ... index=['Portland', 'Berkeley'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> temp_c</span> |
| <span class="sd"> Portland 17.0</span> |
| <span class="sd"> Berkeley 25.0</span> |
| |
| <span class="sd"> Where the value is a callable, evaluated on `df`:</span> |
| |
| <span class="sd"> >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)</span> |
| <span class="sd"> temp_c temp_f</span> |
| <span class="sd"> Portland 17.0 62.6</span> |
| <span class="sd"> Berkeley 25.0 77.0</span> |
| |
| <span class="sd"> Alternatively, the same behavior can be achieved by directly</span> |
| <span class="sd"> referencing an existing Series or sequence and you can also</span> |
| <span class="sd"> create multiple columns within the same assign.</span> |
| |
| <span class="sd"> >>> assigned = df.assign(temp_f=df['temp_c'] * 9 / 5 + 32,</span> |
| <span class="sd"> ... temp_k=df['temp_c'] + 273.15,</span> |
| <span class="sd"> ... temp_idx=df.index)</span> |
| <span class="sd"> >>> assigned[['temp_c', 'temp_f', 'temp_k', 'temp_idx']]</span> |
| <span class="sd"> temp_c temp_f temp_k temp_idx</span> |
| <span class="sd"> Portland 17.0 62.6 290.15 Portland</span> |
| <span class="sd"> Berkeley 25.0 77.0 298.15 Berkeley</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Assigning multiple columns within the same ``assign`` is possible</span> |
| <span class="sd"> but you cannot refer to newly created or modified columns. This</span> |
| <span class="sd"> feature is supported in pandas for Python 3.6 and later but not in</span> |
| <span class="sd"> pandas-on-Spark. In pandas-on-Spark, all items are computed first,</span> |
| <span class="sd"> and then assigned.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_assign</span><span class="p">(</span><span class="n">kwargs</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_assign</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">kwargs</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">MultiIndex</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">IndexOpsMixin</span> |
| |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">is_invalid_assignee</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="ow">not</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="p">(</span><span class="n">IndexOpsMixin</span><span class="p">,</span> <span class="n">Column</span><span class="p">))</span> <span class="ow">or</span> <span class="n">callable</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_scalar</span><span class="p">(</span><span class="n">v</span><span class="p">))</span> |
| <span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">is_invalid_assignee</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Column assignment doesn't support type "</span> <span class="s2">"</span><span class="si">{0}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">v</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">callable</span><span class="p">(</span><span class="n">v</span><span class="p">):</span> |
| <span class="n">kwargs</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">v</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="n">pairs</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="p">(</span><span class="n">k</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">k</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">k</span><span class="p">,)):</span> <span class="p">(</span> |
| <span class="p">(</span><span class="n">v</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">v</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">)</span> |
| <span class="k">else</span> <span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> |
| <span class="k">else</span> <span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">),</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">}</span> |
| |
| <span class="n">scols</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)):</span> |
| <span class="k">if</span> <span class="n">label</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">-</span> <span class="n">i</span><span class="p">]</span> <span class="ow">in</span> <span class="n">pairs</span><span class="p">:</span> |
| <span class="n">scol</span><span class="p">,</span> <span class="n">field</span> <span class="o">=</span> <span class="n">pairs</span><span class="p">[</span><span class="n">label</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">-</span> <span class="n">i</span><span class="p">]]</span> |
| |
| <span class="n">name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">field</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">field</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">break</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">field</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">field_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">scols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| <span class="n">data_fields</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">field</span><span class="p">)</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">field</span><span class="p">)</span> <span class="ow">in</span> <span class="n">pairs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="nb">set</span><span class="p">(</span><span class="n">i</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">):</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">scols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">field</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">field</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="n">data_fields</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">field</span><span class="p">)</span> |
| |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="n">level</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">([</span><span class="s2">""</span><span class="p">]</span> <span class="o">*</span> <span class="p">(</span><span class="n">level</span> <span class="o">-</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">))))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="p">]</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="n">scols</span><span class="p">,</span> <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.from_records"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.from_records.html#pyspark.pandas.DataFrame.from_records">[docs]</a> <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">from_records</span><span class="p">(</span> |
| <span class="n">data</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">tuple</span><span class="p">],</span> <span class="nb">dict</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> |
| <span class="n">index</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">exclude</span><span class="p">:</span> <span class="nb">list</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="nb">list</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">coerce_float</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">nrows</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert structured or record ndarray to DataFrame.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> data : ndarray (structured dtype), list of tuples, dict, or DataFrame</span> |
| <span class="sd"> index : string, list of fields, array-like</span> |
| <span class="sd"> Field of array to use as the index, or alternatively a specific set of input labels to use.</span> |
| <span class="sd"> exclude : sequence, default None</span> |
| <span class="sd"> Columns or fields to exclude</span> |
| <span class="sd"> columns : sequence, default None</span> |
| <span class="sd"> Column names to use. If the passed data do not have names associated with them, this</span> |
| <span class="sd"> argument provides names for the columns. Otherwise this argument indicates the order of</span> |
| <span class="sd"> the columns in the result (any names not found in the data will become all-NA columns)</span> |
| <span class="sd"> coerce_float : boolean, default False</span> |
| <span class="sd"> Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to</span> |
| <span class="sd"> floating point, useful for SQL result sets</span> |
| <span class="sd"> nrows : int, default None</span> |
| <span class="sd"> Number of rows to read if data is an iterator</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> df : DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Use dict as input</span> |
| |
| <span class="sd"> >>> ps.DataFrame.from_records({'A': [1, 2, 3]})</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| |
| <span class="sd"> Use list of tuples as input</span> |
| |
| <span class="sd"> >>> ps.DataFrame.from_records([(1, 2), (3, 4)])</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 1 3 4</span> |
| |
| <span class="sd"> Use NumPy array as input</span> |
| |
| <span class="sd"> >>> ps.DataFrame.from_records(np.eye(3))</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 0 1.0 0.0 0.0</span> |
| <span class="sd"> 1 0.0 1.0 0.0</span> |
| <span class="sd"> 2 0.0 0.0 1.0</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">from_records</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">index</span><span class="p">,</span> <span class="n">exclude</span><span class="p">,</span> <span class="n">columns</span><span class="p">,</span> <span class="n">coerce_float</span><span class="p">,</span> <span class="n">nrows</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to_records"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.to_records.html#pyspark.pandas.DataFrame.to_records">[docs]</a> <span class="k">def</span> <span class="nf">to_records</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">column_dtypes</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index_dtypes</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">np</span><span class="o">.</span><span class="n">recarray</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert DataFrame to a NumPy record array.</span> |
| |
| <span class="sd"> Index will be included as the first field of the record array if</span> |
| <span class="sd"> requested.</span> |
| |
| <span class="sd"> .. note:: This method should only be used if the resulting NumPy ndarray is</span> |
| <span class="sd"> expected to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> index : bool, default True</span> |
| <span class="sd"> Include index in resulting record array, stored in 'index'</span> |
| <span class="sd"> field or using the index label, if set.</span> |
| <span class="sd"> column_dtypes : str, type, dict, default None</span> |
| <span class="sd"> If a string or type, the data type to store all columns. If</span> |
| <span class="sd"> a dictionary, a mapping of column names and indices (zero-indexed)</span> |
| <span class="sd"> to specific data types.</span> |
| <span class="sd"> index_dtypes : str, type, dict, default None</span> |
| <span class="sd"> If a string or type, the data type to store all index levels. If</span> |
| <span class="sd"> a dictionary, a mapping of index level names and indices</span> |
| <span class="sd"> (zero-indexed) to specific data types.</span> |
| <span class="sd"> This mapping is applied only if `index=True`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> numpy.recarray</span> |
| <span class="sd"> NumPy ndarray with the DataFrame labels as fields and each row</span> |
| <span class="sd"> of the DataFrame as entries.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.from_records: Convert structured or record ndarray</span> |
| <span class="sd"> to DataFrame.</span> |
| <span class="sd"> numpy.recarray: An ndarray that allows field access using</span> |
| <span class="sd"> attributes, analogous to typed columns in a</span> |
| <span class="sd"> spreadsheet.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},</span> |
| <span class="sd"> ... index=['a', 'b'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> a 1 0.50</span> |
| <span class="sd"> b 2 0.75</span> |
| |
| <span class="sd"> >>> df.to_records() # doctest: +SKIP</span> |
| <span class="sd"> rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],</span> |
| <span class="sd"> dtype=[('index', 'O'), ('A', '&lt;i8'), ('B', '&lt;f8')])</span> |
| |
| <span class="sd"> The index can be excluded from the record array:</span> |
| |
| <span class="sd"> >>> df.to_records(index=False) # doctest: +SKIP</span> |
| <span class="sd"> rec.array([(1, 0.5 ), (2, 0.75)],</span> |
| <span class="sd"> dtype=[('A', '&lt;i8'), ('B', '&lt;f8')])</span> |
| |
| <span class="sd"> Specification of dtype for columns is new in pandas 0.24.0.</span> |
| <span class="sd"> Data types can be specified for the columns:</span> |
| |
| <span class="sd"> >>> df.to_records(column_dtypes={"A": "int32"}) # doctest: +SKIP</span> |
| <span class="sd"> rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],</span> |
| <span class="sd"> dtype=[('index', 'O'), ('A', '&lt;i4'), ('B', '&lt;f8')])</span> |
| |
| <span class="sd"> Specification of dtype for index is new in pandas 0.24.0.</span> |
| <span class="sd"> Data types can also be specified for the index:</span> |
| |
| <span class="sd"> >>> df.to_records(index_dtypes="&lt;S2") # doctest: +SKIP</span> |
| <span class="sd"> rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],</span> |
| <span class="sd"> dtype=[('index', 'S2'), ('A', '&lt;i8'), ('B', '&lt;f8')])</span> |
| <span class="sd"> """</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span> |
| |
| <span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_records</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_records</span><span class="p">,</span> <span class="n">args</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.copy"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.copy.html#pyspark.pandas.DataFrame.copy">[docs]</a> <span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">deep</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Make a copy of this object's indices and data.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> deep : bool, default True</span> |
| <span class="sd"> This parameter is not supported; it is a dummy parameter kept only to match the pandas signature.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> copy : DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6], 'w': [7, 8]},</span> |
| <span class="sd"> ... columns=['x', 'y', 'z', 'w'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> x y z w</span> |
| <span class="sd"> 0 1 3 5 7</span> |
| <span class="sd"> 1 2 4 6 8</span> |
| <span class="sd"> >>> df_copy = df.copy()</span> |
| <span class="sd"> >>> df_copy</span> |
| <span class="sd"> x y z w</span> |
| <span class="sd"> 0 1 3 5 7</span> |
| <span class="sd"> 1 2 4 6 8</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.dropna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.dropna.html#pyspark.pandas.DataFrame.dropna">[docs]</a> <span class="k">def</span> <span class="nf">dropna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"any"</span><span class="p">,</span> |
| <span class="n">thresh</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Remove missing values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, default 0</span> |
| <span class="sd"> Determine if rows or columns which contain missing values are</span> |
| <span class="sd"> removed.</span> |
| |
| <span class="sd"> * 0, or 'index' : Drop rows which contain missing values.</span> |
| <span class="sd"> * 1, or 'columns' : Drop columns which contain missing values.</span> |
| <span class="sd"> how : {'any', 'all'}, default 'any'</span> |
| <span class="sd"> Determine if row or column is removed from DataFrame, when we have</span> |
| <span class="sd"> at least one NA or all NA.</span> |
| |
| <span class="sd"> * 'any' : If any NA values are present, drop that row or column.</span> |
| <span class="sd"> * 'all' : If all values are NA, drop that row or column.</span> |
| |
| <span class="sd"> thresh : int, optional</span> |
| <span class="sd"> Require that many non-NA values.</span> |
| <span class="sd"> subset : array-like, optional</span> |
| <span class="sd"> Labels along other axis to consider, e.g. if you are dropping rows</span> |
| <span class="sd"> these would be a list of columns to include.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> If True, do operation inplace and return None.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with NA entries dropped from it.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.drop : Drop specified labels from columns.</span> |
| <span class="sd"> DataFrame.isnull: Indicate missing values.</span> |
| <span class="sd"> DataFrame.notnull : Indicate existing (non-missing) values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],</span> |
| <span class="sd"> ... "toy": [None, 'Batmobile', 'Bullwhip'],</span> |
| <span class="sd"> ... "born": [None, "1940-04-25", None]},</span> |
| <span class="sd"> ... columns=['name', 'toy', 'born'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> name toy born</span> |
| <span class="sd"> 0 Alfred None None</span> |
| <span class="sd"> 1 Batman Batmobile 1940-04-25</span> |
| <span class="sd"> 2 Catwoman Bullwhip None</span> |
| |
| <span class="sd"> Drop the rows where at least one element is missing.</span> |
| |
| <span class="sd"> >>> df.dropna()</span> |
| <span class="sd"> name toy born</span> |
| <span class="sd"> 1 Batman Batmobile 1940-04-25</span> |
| |
| <span class="sd"> Drop the columns where at least one element is missing.</span> |
| |
| <span class="sd"> >>> df.dropna(axis='columns')</span> |
| <span class="sd"> name</span> |
| <span class="sd"> 0 Alfred</span> |
| <span class="sd"> 1 Batman</span> |
| <span class="sd"> 2 Catwoman</span> |
| |
| <span class="sd"> Drop the rows where all elements are missing.</span> |
| |
| <span class="sd"> >>> df.dropna(how='all')</span> |
| <span class="sd"> name toy born</span> |
| <span class="sd"> 0 Alfred None None</span> |
| <span class="sd"> 1 Batman Batmobile 1940-04-25</span> |
| <span class="sd"> 2 Catwoman Bullwhip None</span> |
| |
| <span class="sd"> Keep only the rows with at least 2 non-NA values.</span> |
| |
| <span class="sd"> >>> df.dropna(thresh=2)</span> |
| <span class="sd"> name toy born</span> |
| <span class="sd"> 1 Batman Batmobile 1940-04-25</span> |
| <span class="sd"> 2 Catwoman Bullwhip None</span> |
| |
| <span class="sd"> Define in which columns to look for missing values.</span> |
| |
| <span class="sd"> >>> df.dropna(subset=['name', 'born'])</span> |
| <span class="sd"> name toy born</span> |
| <span class="sd"> 1 Batman Batmobile 1940-04-25</span> |
| |
| <span class="sd"> Keep the DataFrame with valid entries in the same variable.</span> |
| |
| <span class="sd"> >>> df.dropna(inplace=True)</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> name toy born</span> |
| <span class="sd"> 1 Batman Batmobile 1940-04-25</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">thresh</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">how</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"must specify how or thresh"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">"any"</span><span class="p">,</span> <span class="s2">"all"</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"invalid how option: </span><span class="si">{h}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">h</span><span class="o">=</span><span class="n">how</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="n">subset</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="p">[(</span><span class="n">subset</span><span class="p">,)]</span> <span class="c1"># type: Optional[List[Label]]</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">subset</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">sub</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">sub</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">sub</span><span class="p">,)</span> <span class="k">for</span> <span class="n">sub</span> <span class="ow">in</span> <span class="n">subset</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">labels</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">invalids</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">invalids</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">invalids</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| |
| <span class="n">cnt</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">+</span> <span class="n">y</span><span class="p">,</span> |
| <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">notna</span><span class="p">()</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span> |
| <span class="p">],</span> |
| <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">thresh</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">pred</span> <span class="o">=</span> <span class="n">cnt</span> <span class="o">>=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">thresh</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"any"</span><span class="p">:</span> |
| <span class="n">pred</span> <span class="o">=</span> <span class="n">cnt</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">labels</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"all"</span><span class="p">:</span> |
| <span class="n">pred</span> <span class="o">=</span> <span class="n">cnt</span> <span class="o">></span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">pred</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| |
| <span class="k">if</span> <span class="n">labels</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">lbl</span><span class="p">)</span> <span class="o">!=</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_level</span> <span class="k">for</span> <span class="n">lbl</span> <span class="ow">in</span> <span class="n">labels</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"The length of each subset must be the same as the index size."</span> |
| <span class="p">)</span> |
| |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">|</span> <span class="n">y</span><span class="p">,</span> |
| <span class="p">[</span> |
| <span class="n">reduce</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&</span> <span class="n">y</span><span class="p">,</span> |
| <span class="p">[</span> |
| <span class="n">scol</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">l</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">l</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">lbl</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">lbl</span> <span class="ow">in</span> <span class="n">labels</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="n">null_counts</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">null_counts</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="o">~</span><span class="n">cond</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="n">counts</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">null_counts</span> <span class="o">+</span> <span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="s2">"*"</span><span class="p">)])</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="n">thresh</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">cnt</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">counts</span><span class="p">)</span> |
| <span class="k">if</span> <span class="p">(</span><span class="n">cnt</span> <span class="ow">or</span> <span class="mi">0</span><span class="p">)</span> <span class="o">>=</span> <span class="nb">int</span><span class="p">(</span><span class="n">thresh</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"any"</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">cnt</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">counts</span><span class="p">)</span> |
| <span class="k">if</span> <span class="p">(</span><span class="n">cnt</span> <span class="ow">or</span> <span class="mi">0</span><span class="p">)</span> <span class="o">==</span> <span class="n">counts</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="p">]</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"all"</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">cnt</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">counts</span><span class="p">)</span> <span class="k">if</span> <span class="p">(</span><span class="n">cnt</span> <span class="ow">or</span> <span class="mi">0</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span> |
| <span class="p">]</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="p">[</span><span class="n">column_labels</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <span class="c1"># TODO: add 'limit' when value parameter exists</span> |
| <div class="viewcode-block" id="DataFrame.fillna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.fillna.html#pyspark.pandas.DataFrame.fillna">[docs]</a> <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""Fill NA/NaN values.</span> |
| |
| <span class="sd"> .. note:: the current implementation of the 'method' parameter in fillna uses Spark's Window</span> |
| <span class="sd"> without a partition specification. This moves all data into a</span> |
| <span class="sd"> single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid this method on very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> value : scalar, dict, Series</span> |
| <span class="sd"> Value to use to fill holes. Alternatively, a dict/Series of values</span> |
| <span class="sd"> specifying which value to use for each column.</span> |
| <span class="sd"> DataFrame is not supported.</span> |
| <span class="sd"> method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None</span> |
| <span class="sd"> Method to use for filling holes in reindexed Series.</span> |
| <span class="sd"> pad / ffill: propagate last valid observation forward to next valid.</span> |
| <span class="sd"> backfill / bfill: use NEXT valid observation to fill gap.</span> |
| <span class="sd"> axis : {0 or `index`}</span> |
| <span class="sd"> 1 and `columns` are not supported.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Fill in place (do not create a new object)</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span> |
| <span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span> |
| <span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span> |
| <span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span> |
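| <span class="sd"> Must be greater than 0 if not None. Passing `limit` together with `value`</span> |
| <span class="sd"> is not currently supported and raises ValueError.</span> |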
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [None, 3, None, None],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 NaN 2.0 NaN 0</span> |
| <span class="sd"> 1 3.0 4.0 NaN 1</span> |
| <span class="sd"> 2 NaN NaN NaN 5</span> |
| <span class="sd"> 3 NaN 3.0 1.0 4</span> |
| |
| <span class="sd"> Replace all NaN elements with 0s.</span> |
| |
| <span class="sd"> >>> df.fillna(0)</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 0.0 2.0 0.0 0</span> |
| <span class="sd"> 1 3.0 4.0 0.0 1</span> |
| <span class="sd"> 2 0.0 0.0 0.0 5</span> |
| <span class="sd"> 3 0.0 3.0 1.0 4</span> |
| |
| <span class="sd"> We can also propagate non-null values forward or backward.</span> |
| |
| <span class="sd"> >>> df.fillna(method='ffill')</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 NaN 2.0 NaN 0</span> |
| <span class="sd"> 1 3.0 4.0 NaN 1</span> |
| <span class="sd"> 2 3.0 4.0 NaN 5</span> |
| <span class="sd"> 3 3.0 3.0 1.0 4</span> |
| |
| <span class="sd"> Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,</span> |
| <span class="sd"> 2, and 3 respectively.</span> |
| |
| <span class="sd"> >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}</span> |
| <span class="sd"> >>> df.fillna(value=values)</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 0.0 2.0 2.0 0</span> |
| <span class="sd"> 1 3.0 4.0 2.0 1</span> |
| <span class="sd"> 2 0.0 1.0 2.0 5</span> |
| <span class="sd"> 3 0.0 3.0 1.0 4</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"fillna currently only works for axis=0 or axis='index'"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">bool</span><span class="p">,</span> <span class="nb">dict</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Unsupported type </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">value</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">limit</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"limit parameter for value is not support now"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">to_dict</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">value</span><span class="o">.</span><span class="n">values</span><span class="p">():</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Unsupported type </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">v</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">k</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">k</span><span class="p">,):</span> <span class="n">v</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">value</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span> |
| |
| <span class="k">def</span> <span class="nf">op</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">value</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="k">if</span> <span class="n">k</span> <span class="o">==</span> <span class="n">label</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">k</span><span class="p">)]:</span> |
| <span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">_fillna</span><span class="p">(</span> |
| <span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">[</span><span class="n">k</span><span class="p">],</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">op</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_fillna</span><span class="p">(</span><span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">method</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">op</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_fillna</span><span class="p">(</span><span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Must specify a fillna 'value' or 'method' parameter."</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">should_resolve</span><span class="o">=</span><span class="p">(</span><span class="n">method</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">))</span> |
| |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">,</span> <span class="n">requires_same_anchor</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.replace"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.replace.html#pyspark.pandas.DataFrame.replace">[docs]</a> <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Dict</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">regex</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"pad"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a new DataFrame replacing a value with another value.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> to_replace : int, float, string, list, tuple or dict</span> |
| <span class="sd"> Value to be replaced.</span> |
| <span class="sd"> value : int, float, string, list or tuple</span> |
| <span class="sd"> Value to use to replace holes. The replacement value must be an int, float,</span> |
| <span class="sd"> or string.</span> |
| <span class="sd"> If value is a list or tuple, value should be of the same length as to_replace.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Fill in place (do not create a new object)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Object after replacement.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"name": ['Ironman', 'Captain America', 'Thor', 'Hulk'],</span> |
| <span class="sd"> ... "weapon": ['Mark-45', 'Shield', 'Mjolnir', 'Smash']},</span> |
| <span class="sd"> ... columns=['name', 'weapon'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> name weapon</span> |
| <span class="sd"> 0 Ironman Mark-45</span> |
| <span class="sd"> 1 Captain America Shield</span> |
| <span class="sd"> 2 Thor Mjolnir</span> |
| <span class="sd"> 3 Hulk Smash</span> |
| |
| <span class="sd"> Scalar `to_replace` and `value`</span> |
| |
| <span class="sd"> >>> df.replace('Ironman', 'War-Machine')</span> |
| <span class="sd"> name weapon</span> |
| <span class="sd"> 0 War-Machine Mark-45</span> |
| <span class="sd"> 1 Captain America Shield</span> |
| <span class="sd"> 2 Thor Mjolnir</span> |
| <span class="sd"> 3 Hulk Smash</span> |
| |
| <span class="sd"> List like `to_replace` and `value`</span> |
| |
| <span class="sd"> >>> df.replace(['Ironman', 'Captain America'], ['Rescue', 'Hawkeye'], inplace=True)</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> name weapon</span> |
| <span class="sd"> 0 Rescue Mark-45</span> |
| <span class="sd"> 1 Hawkeye Shield</span> |
| <span class="sd"> 2 Thor Mjolnir</span> |
| <span class="sd"> 3 Hulk Smash</span> |
| |
| <span class="sd"> Dicts can be used to specify different replacement values for different existing values</span> |
| <span class="sd"> To use a dict in this way the value parameter should be None</span> |
| |
| <span class="sd"> >>> df.replace({'Mjolnir': 'Stormbuster'})</span> |
| <span class="sd"> name weapon</span> |
| <span class="sd"> 0 Rescue Mark-45</span> |
| <span class="sd"> 1 Hawkeye Shield</span> |
| <span class="sd"> 2 Thor Stormbuster</span> |
| <span class="sd"> 3 Hulk Smash</span> |
| |
| <span class="sd"> Dict can specify that different values should be replaced in different columns</span> |
| <span class="sd"> The value parameter should not be None in this case</span> |
| |
| <span class="sd"> >>> df.replace({'weapon': 'Mjolnir'}, 'Stormbuster')</span> |
| <span class="sd"> name weapon</span> |
| <span class="sd"> 0 Rescue Mark-45</span> |
| <span class="sd"> 1 Hawkeye Shield</span> |
| <span class="sd"> 2 Thor Stormbuster</span> |
| <span class="sd"> 3 Hulk Smash</span> |
| |
| <span class="sd"> Nested dictionaries</span> |
| <span class="sd"> The value parameter should be None to use a nested dict in this way</span> |
| |
| <span class="sd"> >>> df.replace({'weapon': {'Mjolnir': 'Stormbuster'}})</span> |
| <span class="sd"> name weapon</span> |
| <span class="sd"> 0 Rescue Mark-45</span> |
| <span class="sd"> 1 Hawkeye Shield</span> |
| <span class="sd"> 2 Thor Stormbuster</span> |
| <span class="sd"> 3 Hulk Smash</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">method</span> <span class="o">!=</span> <span class="s2">"pad"</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"replace currently works only for method='pad"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">limit</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"replace currently works only when limit=None"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">regex</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">False</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"replace currently doesn't supports regex"</span><span class="p">)</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Unsupported type </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">value</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">to_replace</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span> |
| <span class="n">to_replace</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Unsupported type </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">to_replace</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">))</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">to_replace</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Length of to_replace and value must be same"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">to_replace</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span> |
| <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">to_replace</span><span class="o">.</span><span class="n">values</span><span class="p">())</span> |
| <span class="p">):</span> |
| <span class="n">to_replace_dict</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="nb">dict</span><span class="p">,</span> <span class="n">to_replace</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">op</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">psser</span><span class="o">.</span><span class="n">name</span> <span class="ow">in</span> <span class="n">to_replace_dict</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span> |
| <span class="n">to_replace</span><span class="o">=</span><span class="n">to_replace_dict</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">],</span> <span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">regex</span><span class="o">=</span><span class="n">regex</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">op</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="n">to_replace</span><span class="o">=</span><span class="n">to_replace</span><span class="p">,</span> <span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">regex</span><span class="o">=</span><span class="n">regex</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="n">op</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.clip"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.clip.html#pyspark.pandas.DataFrame.clip">[docs]</a> <span class="k">def</span> <span class="nf">clip</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">lower</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">upper</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Trim values at input threshold(s).</span> |
| |
| <span class="sd"> Assigns values outside boundary to boundary values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> lower : float or int, default None</span> |
| <span class="sd"> Minimum threshold value. All values below this threshold will be set to it.</span> |
| <span class="sd"> upper : float or int, default None</span> |
| <span class="sd"> Maximum threshold value. All values above this threshold will be set to it.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with the values outside the clip boundaries replaced.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.DataFrame({'A': [0, 2, 4]}).clip(1, 3)</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> One difference between this implementation and pandas is that running</span> |
| <span class="sd"> pd.DataFrame({'A': ['a', 'b']}).clip(0, 1) will crash with "TypeError: '<=' not supported</span> |
| <span class="sd"> between instances of 'str' and 'int'" while ps.DataFrame({'A': ['a', 'b']}).clip(0, 1)</span> |
| <span class="sd"> will output the original DataFrame, simply ignoring the incompatible types.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">lower</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">upper</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"List-like value are not supported for 'lower' and 'upper' at the "</span> <span class="o">+</span> <span class="s2">"moment"</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">lower</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">upper</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">clip</span><span class="p">(</span><span class="n">lower</span><span class="o">=</span><span class="n">lower</span><span class="p">,</span> <span class="n">upper</span><span class="o">=</span><span class="n">upper</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.head"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.head.html#pyspark.pandas.DataFrame.head">[docs]</a> <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the first `n` rows.</span> |
| |
| <span class="sd"> This function returns the first `n` rows for the object based</span> |
| <span class="sd"> on position. It is useful for quickly testing if your object</span> |
| <span class="sd"> has the right type of data in it.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, default 5</span> |
| <span class="sd"> Number of rows to select.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> obj_head : same type as caller</span> |
| <span class="sd"> The first `n` rows of the caller object.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion',</span> |
| <span class="sd"> ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> 0 alligator</span> |
| <span class="sd"> 1 bee</span> |
| <span class="sd"> 2 falcon</span> |
| <span class="sd"> 3 lion</span> |
| <span class="sd"> 4 monkey</span> |
| <span class="sd"> 5 parrot</span> |
| <span class="sd"> 6 shark</span> |
| <span class="sd"> 7 whale</span> |
| <span class="sd"> 8 zebra</span> |
| |
| <span class="sd"> Viewing the first 5 lines</span> |
| |
| <span class="sd"> >>> df.head()</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> 0 alligator</span> |
| <span class="sd"> 1 bee</span> |
| <span class="sd"> 2 falcon</span> |
| <span class="sd"> 3 lion</span> |
| <span class="sd"> 4 monkey</span> |
| |
| <span class="sd"> Viewing the first `n` lines (three in this case)</span> |
| |
| <span class="sd"> >>> df.head(3)</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> 0 alligator</span> |
| <span class="sd"> 1 bee</span> |
| <span class="sd"> 2 falcon</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="o"><</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">n</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">+</span> <span class="n">n</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="o"><=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="k">if</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.ordered_head"</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="n">n</span><span class="p">)))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.last"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.last.html#pyspark.pandas.DataFrame.last">[docs]</a> <span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Select final periods of time series data based on a date offset.</span> |
| |
| <span class="sd"> When having a DataFrame with dates as index, this function can</span> |
| <span class="sd"> select the last few rows based on a date offset.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> offset : str or DateOffset</span> |
| <span class="sd"> The offset length of the data that will be selected. For instance,</span> |
| <span class="sd"> '3D' will display all the rows having their index within the last 3 days.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> A subset of the caller.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If the index is not a :class:`DatetimeIndex`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=index)</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-09 1</span> |
| <span class="sd"> 2018-04-11 2</span> |
| <span class="sd"> 2018-04-13 3</span> |
| <span class="sd"> 2018-04-15 4</span> |
| |
| <span class="sd"> Get the rows for the last 3 days:</span> |
| |
| <span class="sd"> >>> psdf.last('3D')</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-13 3</span> |
| <span class="sd"> 2018-04-15 4</span> |
| |
| <span class="sd"> Notice the data for the last 3 calendar days were returned, not the last</span> |
| <span class="sd"> 3 observed days in the dataset, and therefore data for 2018-04-11 was</span> |
| <span class="sd"> not returned.</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Check that the index is a DatetimeIndex</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DatetimeIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"'last' only supports a DatetimeIndex"</span><span class="p">)</span> |
| |
| <span class="n">offset</span> <span class="o">=</span> <span class="n">to_offset</span><span class="p">(</span><span class="n">offset</span><span class="p">)</span> |
| <span class="n">from_date</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">max</span><span class="p">()</span> <span class="o">-</span> <span class="n">offset</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">from_date</span><span class="p">:])</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.first"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.first.html#pyspark.pandas.DataFrame.first">[docs]</a> <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Select first periods of time series data based on a date offset.</span> |
| |
| <span class="sd"> When having a DataFrame with dates as index, this function can</span> |
| <span class="sd"> select the first few rows based on a date offset.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> offset : str or DateOffset</span> |
| <span class="sd"> The offset length of the data that will be selected. For instance,</span> |
| <span class="sd"> '3D' will display all the rows having their index within the first 3 days.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> A subset of the caller.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> TypeError</span> |
| <span class="sd"> If the index is not a :class:`DatetimeIndex`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=index)</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-09 1</span> |
| <span class="sd"> 2018-04-11 2</span> |
| <span class="sd"> 2018-04-13 3</span> |
| <span class="sd"> 2018-04-15 4</span> |
| |
| <span class="sd"> Get the rows for the first 3 days:</span> |
| |
| <span class="sd"> >>> psdf.first('3D')</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 2018-04-09 1</span> |
| <span class="sd"> 2018-04-11 2</span> |
| |
| <span class="sd"> Notice the data for the first 3 calendar days were returned, not the first</span> |
| <span class="sd"> 3 observed days in the dataset, and therefore data for 2018-04-13 was</span> |
| <span class="sd"> not returned.</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Check that the index is a DatetimeIndex</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DatetimeIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"'first' only supports a DatetimeIndex"</span><span class="p">)</span> |
| |
| <span class="n">offset</span> <span class="o">=</span> <span class="n">to_offset</span><span class="p">(</span><span class="n">offset</span><span class="p">)</span> |
| <span class="n">to_date</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">min</span><span class="p">()</span> <span class="o">+</span> <span class="n">offset</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[:</span><span class="n">to_date</span><span class="p">])</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.pivot_table"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.pivot_table.html#pyspark.pandas.DataFrame.pivot_table">[docs]</a> <span class="k">def</span> <span class="nf">pivot_table</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">aggfunc</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="s2">"mean"</span><span class="p">,</span> |
| <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Create a spreadsheet-style pivot table as a DataFrame. The levels in</span> |
| <span class="sd"> the pivot table will be stored in MultiIndex objects (hierarchical</span> |
| <span class="sd"> indexes) on the index and columns of the result DataFrame.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> values : column to aggregate.</span> |
| <span class="sd"> They should be either a list less than three or a string.</span> |
| <span class="sd"> index : column (string) or list of columns</span> |
| <span class="sd"> If an array is passed, it must be the same length as the data.</span> |
| <span class="sd"> The list should contain strings.</span> |
| <span class="sd"> columns : column</span> |
| <span class="sd"> Columns used in the pivot operation. Only one column is supported and</span> |
| <span class="sd"> it should be a string.</span> |
| <span class="sd"> aggfunc : function (string), dict, default mean</span> |
| <span class="sd"> If dict is passed, the key is column to aggregate and value</span> |
| <span class="sd"> is function or list of functions.</span> |
| <span class="sd"> fill_value : scalar, default None</span> |
| <span class="sd"> Value to replace missing values with.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> table : DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",</span> |
| <span class="sd"> ... "bar", "bar", "bar", "bar"],</span> |
| <span class="sd"> ... "B": ["one", "one", "one", "two", "two",</span> |
| <span class="sd"> ... "one", "one", "two", "two"],</span> |
| <span class="sd"> ... "C": ["small", "large", "large", "small",</span> |
| <span class="sd"> ... "small", "large", "small", "small",</span> |
| <span class="sd"> ... "large"],</span> |
| <span class="sd"> ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],</span> |
| <span class="sd"> ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]},</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D', 'E'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D E</span> |
| <span class="sd"> 0 foo one small 1 2</span> |
| <span class="sd"> 1 foo one large 2 4</span> |
| <span class="sd"> 2 foo one large 2 5</span> |
| <span class="sd"> 3 foo two small 3 5</span> |
| <span class="sd"> 4 foo two small 3 6</span> |
| <span class="sd"> 5 bar one large 4 6</span> |
| <span class="sd"> 6 bar one small 5 8</span> |
| <span class="sd"> 7 bar two small 6 9</span> |
| <span class="sd"> 8 bar two large 7 9</span> |
| |
| <span class="sd"> This first example aggregates values by taking the sum.</span> |
| |
| <span class="sd"> >>> table = df.pivot_table(values='D', index=['A', 'B'],</span> |
| <span class="sd"> ... columns='C', aggfunc='sum')</span> |
| <span class="sd"> >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> C large small</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> bar one 4.0 5</span> |
| <span class="sd"> two 7.0 6</span> |
| <span class="sd"> foo one 4.0 1</span> |
| <span class="sd"> two NaN 6</span> |
| |
| <span class="sd"> We can also fill missing values using the `fill_value` parameter.</span> |
| |
| <span class="sd"> >>> table = df.pivot_table(values='D', index=['A', 'B'],</span> |
| <span class="sd"> ... columns='C', aggfunc='sum', fill_value=0)</span> |
| <span class="sd"> >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> C large small</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> bar one 4 5</span> |
| <span class="sd"> two 7 6</span> |
| <span class="sd"> foo one 4 1</span> |
| <span class="sd"> two 0 6</span> |
| |
| <span class="sd"> We can also pass a dict to choose the aggregate function for each</span> |
| <span class="sd"> value column.</span> |
| |
| <span class="sd"> >>> table = df.pivot_table(values=['D'], index =['C'],</span> |
| <span class="sd"> ... columns="A", aggfunc={'D': 'mean'})</span> |
| <span class="sd"> >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> D</span> |
| <span class="sd"> A bar foo</span> |
| <span class="sd"> C</span> |
| <span class="sd"> large 5.5 2.000000</span> |
| <span class="sd"> small 5.5 2.333333</span> |
| |
| <span class="sd"> The next example aggregates on multiple values.</span> |
| |
| <span class="sd"> >>> table = df.pivot_table(index=['C'], columns="A", values=['D', 'E'],</span> |
| <span class="sd"> ... aggfunc={'D': 'mean', 'E': 'sum'})</span> |
| <span class="sd"> >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> D E</span> |
| <span class="sd"> A bar foo bar foo</span> |
| <span class="sd"> C</span> |
| <span class="sd"> large 5.5 2.000000 15 9</span> |
| <span class="sd"> small 5.5 2.333333 17 13</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"columns should be one column name."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="n">is_name_like_value</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">values</span><span class="p">)</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"values should be one column or list of columns."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span> |
| <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> |
| <span class="ow">or</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span> |
| <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">aggfunc</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"aggfunc must be a dict mapping from column name "</span> |
| <span class="s2">"to aggregate functions (string)."</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"pivot_table doesn't support aggfunc"</span> <span class="s2">" as dict and without index."</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="ow">and</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"values can't be a list without index."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Wrong columns </span><span class="si">{}</span><span class="s2">."</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">columns</span><span class="p">)))</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="p">(</span><span class="n">columns</span><span class="p">,)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">values</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">NumericType</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">values</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"values should be a numeric type."</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="n">values</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">values</span><span class="p">,)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">values</span><span class="p">),</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"values should be a numeric type."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">agg_cols</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span> |
| <span class="s2">"</span><span class="si">{1}</span><span class="s2">(`</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{0}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">value</span><span class="p">),</span> <span class="n">aggfunc</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">values</span> |
| <span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_cols</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span> |
| <span class="s2">"</span><span class="si">{1}</span><span class="s2">(`</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{0}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">values</span><span class="p">),</span> <span class="n">aggfunc</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="n">aggfunc</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="n">key</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">key</span><span class="p">,):</span> <span class="n">value</span> <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">aggfunc</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">}</span> |
| <span class="n">agg_cols</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span> |
| <span class="s2">"</span><span class="si">{1}</span><span class="s2">(`</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{0}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">key</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">aggfunc</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">]</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">key</span> <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">_</span> <span class="ow">in</span> <span class="n">aggfunc</span><span class="o">.</span><span class="n">items</span><span class="p">()]</span> |
| |
| <span class="k">if</span> <span class="nb">set</span><span class="p">(</span><span class="n">agg_columns</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">set</span><span class="p">(</span><span class="n">values</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Columns in aggfunc must be the same as values."</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="k">if</span> <span class="n">index</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">groupBy</span><span class="p">()</span> |
| <span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">pivot_col</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">agg_cols</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">label</span><span class="p">,)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index</span><span class="p">]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">groupBy</span><span class="p">([</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index</span><span class="p">])</span> |
| <span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">pivot_col</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">agg_cols</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"index should be a None or a list of columns."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">fill_value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">fill_value</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">)):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">fill_value</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">index_columns</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index</span><span class="p">]</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">field_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">column</span> <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span> <span class="k">if</span> <span class="n">column</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">index_columns</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="c1"># If we have two values, Spark will return column's name</span> |
| <span class="c1"># in this format: column_values, where column contains</span> |
| <span class="c1"># their values in the DataFrame and values is</span> |
| <span class="c1"># the column list passed to the pivot_table().</span> |
| <span class="c1"># E.g. if column is b and values is ['b','e'],</span> |
| <span class="c1"># then ['2_b', '2_e', '3_b', '3_e'].</span> |
| |
| <span class="c1"># We sort the columns of Spark DataFrame by values.</span> |
| <span class="n">data_columns</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">"_"</span><span class="p">,</span> <span class="mi">1</span><span class="p">)[</span><span class="mi">1</span><span class="p">])</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">index_columns</span> <span class="o">+</span> <span class="n">data_columns</span><span class="p">)</span> |
| |
| <span class="n">column_name_to_index</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span> |
| <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">column_name_to_index</span><span class="p">[</span><span class="n">name</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">"_"</span><span class="p">)[</span><span class="mi">1</span><span class="p">]])</span> <span class="o">+</span> <span class="p">[</span><span class="n">name</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">"_"</span><span class="p">)[</span><span class="mi">0</span><span class="p">]])</span> |
| <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">data_columns</span> |
| <span class="p">]</span> |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">],</span> <span class="kc">None</span><span class="p">)]</span> <span class="o">*</span> <span class="n">column_labels_level</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> |
| <span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">columns</span><span class="p">]</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_columns</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> <span class="c1"># type: "DataFrame"</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="p">[</span><span class="n">column</span><span class="p">])</span> <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">]</span> |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="p">([</span><span class="n">cast</span><span class="p">(</span><span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">],</span> <span class="kc">None</span><span class="p">)]</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span> <span class="o">+</span> <span class="p">[</span><span class="n">columns</span><span class="p">]</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_columns</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_columns</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="p">[</span><span class="n">columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">index_values</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_values</span> <span class="o">=</span> <span class="n">values</span> |
| <span class="n">index_map</span> <span class="o">=</span> <span class="n">OrderedDict</span><span class="p">()</span> <span class="c1"># type: Dict[str, Optional[Label]]</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">index_value</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">index_values</span><span class="p">):</span> |
| <span class="n">colname</span> <span class="o">=</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">colname</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">index_value</span><span class="p">))</span> |
| <span class="n">index_map</span><span class="p">[</span><span class="n">colname</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_map</span><span class="o">.</span><span class="n">keys</span><span class="p">()],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_map</span><span class="o">.</span><span class="n">values</span><span class="p">()),</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="p">[</span><span class="n">columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="n">psdf_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psdf_columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">psdf_columns</span><span class="o">.</span><span class="n">set_levels</span><span class="p">(</span> |
| <span class="n">psdf_columns</span><span class="o">.</span><span class="n">levels</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">)</span> |
| <span class="p">),</span> |
| <span class="n">level</span><span class="o">=-</span><span class="mi">1</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">psdf_columns</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.pivot"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.pivot.html#pyspark.pandas.DataFrame.pivot">[docs]</a> <span class="k">def</span> <span class="nf">pivot</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return reshaped DataFrame organized by given index / column values.</span> |
| |
| <span class="sd"> Reshape data (produce a "pivot" table) based on column values. Uses</span> |
| <span class="sd"> unique values from specified `index` / `columns` to form axes of the</span> |
| <span class="sd"> resulting DataFrame. This function does not support data</span> |
| <span class="sd"> aggregation.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> index : string, optional</span> |
| <span class="sd"> Column to use to make new frame's index. If None, uses</span> |
| <span class="sd"> existing index.</span> |
| <span class="sd"> columns : string</span> |
| <span class="sd"> Column to use to make new frame's columns.</span> |
| <span class="sd"> values : string, object or a list of the previous</span> |
| <span class="sd"> Column(s) to use for populating new frame's values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Returns reshaped DataFrame.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.pivot_table : Generalization of pivot that can handle</span> |
| <span class="sd"> duplicate values for one index/column pair.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',</span> |
| <span class="sd"> ... 'two'],</span> |
| <span class="sd"> ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],</span> |
| <span class="sd"> ... 'baz': [1, 2, 3, 4, 5, 6],</span> |
| <span class="sd"> ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']},</span> |
| <span class="sd"> ... columns=['foo', 'bar', 'baz', 'zoo'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> foo bar baz zoo</span> |
| <span class="sd"> 0 one A 1 x</span> |
| <span class="sd"> 1 one B 2 y</span> |
| <span class="sd"> 2 one C 3 z</span> |
| <span class="sd"> 3 two A 4 q</span> |
| <span class="sd"> 4 two B 5 w</span> |
| <span class="sd"> 5 two C 6 t</span> |
| |
| <span class="sd"> >>> df.pivot(index='foo', columns='bar', values='baz').sort_index()</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> bar A B C</span> |
| <span class="sd"> foo</span> |
| <span class="sd"> one 1 2 3</span> |
| <span class="sd"> two 4 5 6</span> |
| |
| <span class="sd"> >>> df.pivot(columns='bar', values='baz').sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> bar A B C</span> |
| <span class="sd"> 0 1.0 NaN NaN</span> |
| <span class="sd"> 1 NaN 2.0 NaN</span> |
| <span class="sd"> 2 NaN NaN 3.0</span> |
| <span class="sd"> 3 4.0 NaN NaN</span> |
| <span class="sd"> 4 NaN 5.0 NaN</span> |
| <span class="sd"> 5 NaN NaN 6.0</span> |
| |
| <span class="sd"> Notice that, unlike pandas, which raises a ValueError when duplicated values are found,</span> |
| <span class="sd"> pandas-on-Spark's pivot still works with the first value it meets during operation because</span> |
| <span class="sd"> pivot is an expensive operation and it is preferred to permissively execute over failing</span> |
| <span class="sd"> fast when processing large data.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({"foo": ['one', 'one', 'two', 'two'],</span> |
| <span class="sd"> ... "bar": ['A', 'A', 'B', 'C'],</span> |
| <span class="sd"> ... "baz": [1, 2, 3, 4]}, columns=['foo', 'bar', 'baz'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> foo bar baz</span> |
| <span class="sd"> 0 one A 1</span> |
| <span class="sd"> 1 one A 2</span> |
| <span class="sd"> 2 two B 3</span> |
| <span class="sd"> 3 two C 4</span> |
| |
| <span class="sd"> >>> df.pivot(index='foo', columns='bar', values='baz').sort_index()</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> bar A B C</span> |
| <span class="sd"> foo</span> |
| <span class="sd"> one 1.0 NaN NaN</span> |
| <span class="sd"> two NaN 3.0 4.0</span> |
| |
| <span class="sd"> It also supports multi-index and multi-index columns.</span> |
| <span class="sd"> >>> df.columns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'baz')])</span> |
| |
| <span class="sd"> >>> df = df.set_index(('a', 'bar'), append=True)</span> |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> foo baz</span> |
| <span class="sd"> (a, bar)</span> |
| <span class="sd"> 0 A one 1</span> |
| <span class="sd"> 1 A one 2</span> |
| <span class="sd"> 2 B two 3</span> |
| <span class="sd"> 3 C two 4</span> |
| |
| <span class="sd"> >>> df.pivot(columns=('a', 'foo'), values=('b', 'baz')).sort_index()</span> |
| <span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> ('a', 'foo') one two</span> |
| <span class="sd"> (a, bar)</span> |
| <span class="sd"> 0 A 1.0 NaN</span> |
| <span class="sd"> 1 A 2.0 NaN</span> |
| <span class="sd"> 2 B NaN 3.0</span> |
| <span class="sd"> 3 C NaN 4.0</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"columns should be set."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">values</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"values should be set."</span><span class="p">)</span> |
| |
| <span class="n">should_use_existing_index</span> <span class="o">=</span> <span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">should_use_existing_index</span><span class="p">:</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="n">index_labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">index</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># The index after `reset_index()` will never be used, so use "distributed" index</span> |
| <span class="c1"># as a dummy to avoid overhead.</span> |
| <span class="k">with</span> <span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.default_index_type"</span><span class="p">,</span> <span class="s2">"distributed"</span><span class="p">):</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| <span class="n">index_labels</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">]</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">pivot_table</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_labels</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="n">values</span><span class="o">=</span><span class="n">values</span><span class="p">,</span> <span class="n">aggfunc</span><span class="o">=</span><span class="s2">"first"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_use_existing_index</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">df</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">columns</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">:</span> |
| <span class="sd">"""The column labels of the DataFrame."""</span> |
| <span class="n">names</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">name</span> <span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span> <span class="k">else</span> <span class="n">name</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="o">.</span><span class="n">from_tuples</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">names</span><span class="o">=</span><span class="n">names</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">([</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">],</span> <span class="n">name</span><span class="o">=</span><span class="n">names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">return</span> <span class="n">columns</span> |
| |
| <span class="nd">@columns</span><span class="o">.</span><span class="n">setter</span> |
| <span class="k">def</span> <span class="nf">columns</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columns</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">columns</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">col</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">allow_none</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span> |
| <span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Length mismatch: Expected axis has </span><span class="si">{}</span><span class="s2"> elements, "</span> |
| <span class="s2">"new values have </span><span class="si">{}</span><span class="s2"> elements"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">):</span> |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">name</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">name</span><span class="p">,)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">columns</span><span class="o">.</span><span class="n">names</span> |
| <span class="p">]</span> <span class="c1"># type: Optional[List]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="n">pssers</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">pssers</span><span class="p">,</span> <span class="n">column_label_names</span><span class="o">=</span><span class="n">column_label_names</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">dtypes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""Return the dtypes in the DataFrame.</span> |
| |
| <span class="sd"> This returns a Series with the data type of each column. The result's index is the original</span> |
| <span class="sd"> DataFrame's columns. Columns with mixed types are stored with the object dtype.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> pd.Series</span> |
| <span class="sd"> The data type of each column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': list('abc'),</span> |
| <span class="sd"> ... 'b': list(range(1, 4)),</span> |
| <span class="sd"> ... 'c': np.arange(3, 6).astype('i1'),</span> |
| <span class="sd"> ... 'd': np.arange(4.0, 7.0, dtype='float64'),</span> |
| <span class="sd"> ... 'e': [True, False, True],</span> |
| <span class="sd"> ... 'f': pd.date_range('20130101', periods=3)},</span> |
| <span class="sd"> ... columns=['a', 'b', 'c', 'd', 'e', 'f'])</span> |
| <span class="sd"> >>> df.dtypes</span> |
| <span class="sd"> a object</span> |
| <span class="sd"> b int64</span> |
| <span class="sd"> c int8</span> |
| <span class="sd"> d float64</span> |
| <span class="sd"> e bool</span> |
| <span class="sd"> f datetime64[ns]</span> |
| <span class="sd"> dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span> |
| <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">],</span> |
| <span class="n">index</span><span class="o">=</span><span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">label</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.select_dtypes"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.select_dtypes.html#pyspark.pandas.DataFrame.select_dtypes">[docs]</a> <span class="k">def</span> <span class="nf">select_dtypes</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">include</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">exclude</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a subset of the DataFrame's columns based on the column dtypes.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> include, exclude : scalar or list-like</span> |
| <span class="sd"> A selection of dtypes or strings to be included/excluded. At least</span> |
| <span class="sd"> one of these parameters must be supplied. It also takes Spark SQL</span> |
| <span class="sd"> DDL type strings, for instance, 'string' and 'date'.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> The subset of the frame including the dtypes in ``include`` and</span> |
| <span class="sd"> excluding the dtypes in ``exclude``.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> ValueError</span> |
| <span class="sd"> * If both of ``include`` and ``exclude`` are empty</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2] * 3,</span> |
| <span class="sd"> ... 'b': [True, False] * 3,</span> |
| <span class="sd"> ... 'c': [1.0, 2.0] * 3})</span> |
| <span class="sd"> >>> df.select_dtypes()</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: at least one of include or exclude must be nonempty</span> |
| |
| <span class="sd"> * If ``include`` and ``exclude`` have overlapping elements</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2] * 3,</span> |
| <span class="sd"> ... 'b': [True, False] * 3,</span> |
| <span class="sd"> ... 'c': [1.0, 2.0] * 3})</span> |
| <span class="sd"> >>> df.select_dtypes(include='a', exclude='a')</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: include and exclude overlap on {'a'}</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> * To select datetimes, use ``np.datetime64``, ``'datetime'`` or</span> |
| <span class="sd"> ``'datetime64'``</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2] * 3,</span> |
| <span class="sd"> ... 'b': [True, False] * 3,</span> |
| <span class="sd"> ... 'c': [1.0, 2.0] * 3,</span> |
| <span class="sd"> ... 'd': ['a', 'b'] * 3}, columns=['a', 'b', 'c', 'd'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c d</span> |
| <span class="sd"> 0 1 True 1.0 a</span> |
| <span class="sd"> 1 2 False 2.0 b</span> |
| <span class="sd"> 2 1 True 1.0 a</span> |
| <span class="sd"> 3 2 False 2.0 b</span> |
| <span class="sd"> 4 1 True 1.0 a</span> |
| <span class="sd"> 5 2 False 2.0 b</span> |
| |
| <span class="sd"> >>> df.select_dtypes(include='bool')</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 0 True</span> |
| <span class="sd"> 1 False</span> |
| <span class="sd"> 2 True</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> 4 True</span> |
| <span class="sd"> 5 False</span> |
| |
| <span class="sd"> >>> df.select_dtypes(include=['float64'], exclude=['int'])</span> |
| <span class="sd"> c</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.0</span> |
| <span class="sd"> 2 1.0</span> |
| <span class="sd"> 3 2.0</span> |
| <span class="sd"> 4 1.0</span> |
| <span class="sd"> 5 2.0</span> |
| |
| <span class="sd"> >>> df.select_dtypes(exclude=['int'])</span> |
| <span class="sd"> b c d</span> |
| <span class="sd"> 0 True 1.0 a</span> |
| <span class="sd"> 1 False 2.0 b</span> |
| <span class="sd"> 2 True 1.0 a</span> |
| <span class="sd"> 3 False 2.0 b</span> |
| <span class="sd"> 4 True 1.0 a</span> |
| <span class="sd"> 5 False 2.0 b</span> |
| |
| <span class="sd"> Spark SQL DDL type strings can be used as well.</span> |
| |
| <span class="sd"> >>> df.select_dtypes(exclude=['string'])</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 True 1.0</span> |
| <span class="sd"> 1 2 False 2.0</span> |
| <span class="sd"> 2 1 True 1.0</span> |
| <span class="sd"> 3 2 False 2.0</span> |
| <span class="sd"> 4 1 True 1.0</span> |
| <span class="sd"> 5 2 False 2.0</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">_parse_datatype_string</span> <span class="c1"># type: ignore</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">include</span><span class="p">):</span> |
| <span class="n">include_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">include</span><span class="p">]</span> <span class="k">if</span> <span class="n">include</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">include_list</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">include</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">exclude</span><span class="p">):</span> |
| <span class="n">exclude_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">exclude</span><span class="p">]</span> <span class="k">if</span> <span class="n">exclude</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">exclude_list</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">exclude</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">((</span><span class="n">include_list</span><span class="p">,</span> <span class="n">exclude_list</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"at least one of include or exclude must be "</span> <span class="s2">"nonempty"</span><span class="p">)</span> |
| |
| <span class="c1"># can't both include AND exclude!</span> |
| <span class="k">if</span> <span class="nb">set</span><span class="p">(</span><span class="n">include_list</span><span class="p">)</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">exclude_list</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"include and exclude overlap on </span><span class="si">{inc_ex}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">inc_ex</span><span class="o">=</span><span class="nb">set</span><span class="p">(</span><span class="n">include_list</span><span class="p">)</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">exclude_list</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Handle Spark types</span> |
| <span class="n">include_spark_type</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">inc</span> <span class="ow">in</span> <span class="n">include_list</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">include_spark_type</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">_parse_datatype_string</span><span class="p">(</span><span class="n">inc</span><span class="p">))</span> |
| <span class="k">except</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="n">exclude_spark_type</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">exc</span> <span class="ow">in</span> <span class="n">exclude_list</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">exclude_spark_type</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">_parse_datatype_string</span><span class="p">(</span><span class="n">exc</span><span class="p">))</span> |
| <span class="k">except</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="c1"># Handle pandas types</span> |
| <span class="n">include_numpy_type</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">inc</span> <span class="ow">in</span> <span class="n">include_list</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">include_numpy_type</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">infer_dtype_from_object</span><span class="p">(</span><span class="n">inc</span><span class="p">))</span> |
| <span class="k">except</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="n">exclude_numpy_type</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">exc</span> <span class="ow">in</span> <span class="n">exclude_list</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">exclude_numpy_type</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">infer_dtype_from_object</span><span class="p">(</span><span class="n">exc</span><span class="p">))</span> |
| <span class="k">except</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">include_list</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">should_include</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">infer_dtype_from_object</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="ow">in</span> <span class="n">include_numpy_type</span> |
| <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="ow">in</span> <span class="n">include_spark_type</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">should_include</span> <span class="o">=</span> <span class="ow">not</span> <span class="p">(</span> |
| <span class="n">infer_dtype_from_object</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="ow">in</span> <span class="n">exclude_numpy_type</span> |
| <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="ow">in</span> <span class="n">exclude_spark_type</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_include</span><span class="p">:</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">([</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">])</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.droplevel"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.droplevel.html#pyspark.pandas.DataFrame.droplevel">[docs]</a> <span class="k">def</span> <span class="nf">droplevel</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">level</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]]],</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return DataFrame with requested index / column level(s) removed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> level: int, str, or list-like</span> |
| <span class="sd"> If a string is given, must be the name of a level. If list-like, elements must</span> |
| <span class="sd"> be names or positional indexes of levels.</span> |
| |
| <span class="sd"> axis: {0 or ‘index’, 1 or ‘columns’}, default 0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame with requested index / column level(s) removed.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[3, 4], [7, 8], [11, 12]],</span> |
| <span class="sd"> ... index=pd.MultiIndex.from_tuples([(1, 2), (5, 6), (9, 10)], names=["a", "b"]),</span> |
| <span class="sd"> ... )</span> |
| |
| <span class="sd"> >>> df.columns = pd.MultiIndex.from_tuples([</span> |
| <span class="sd"> ... ('c', 'e'), ('d', 'f')</span> |
| <span class="sd"> ... ], names=['level_1', 'level_2'])</span> |
| |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> level_1 c d</span> |
| <span class="sd"> level_2 e f</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 1 2 3 4</span> |
| <span class="sd"> 5 6 7 8</span> |
| <span class="sd"> 9 10 11 12</span> |
| |
| <span class="sd"> >>> df.droplevel('a') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> level_1 c d</span> |
| <span class="sd"> level_2 e f</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 2 3 4</span> |
| <span class="sd"> 6 7 8</span> |
| <span class="sd"> 10 11 12</span> |
| |
| <span class="sd"> >>> df.droplevel('level_2', axis=1) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> level_1 c d</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 1 2 3 4</span> |
| <span class="sd"> 5 6 7 8</span> |
| <span class="sd"> 9 10 11 12</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">level</span><span class="p">,</span> <span class="p">(</span><span class="nb">tuple</span><span class="p">,</span> <span class="nb">list</span><span class="p">)):</span> <span class="c1"># huh?</span> |
| <span class="n">level</span> <span class="o">=</span> <span class="p">[</span><span class="n">level</span><span class="p">]</span> |
| |
| <span class="n">names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> |
| <span class="n">nlevels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> |
| |
| <span class="n">int_level</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">n</span> <span class="ow">in</span> <span class="n">level</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">n</span> <span class="o">=</span> <span class="n">n</span> <span class="o">+</span> <span class="n">nlevels</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">IndexError</span><span class="p">(</span> |
| <span class="s2">"Too many levels: Index has only </span><span class="si">{}</span><span class="s2"> levels, "</span> |
| <span class="s2">"</span><span class="si">{}</span><span class="s2"> is not a valid level number"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">nlevels</span><span class="p">,</span> <span class="p">(</span><span class="n">n</span> <span class="o">-</span> <span class="n">nlevels</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="o">>=</span> <span class="n">nlevels</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">IndexError</span><span class="p">(</span> |
| <span class="s2">"Too many levels: Index has only </span><span class="si">{}</span><span class="s2"> levels, not </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">nlevels</span><span class="p">,</span> <span class="p">(</span><span class="n">n</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">"Level </span><span class="si">{}</span><span class="s2"> not found"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">n</span><span class="p">))</span> |
| <span class="n">n</span> <span class="o">=</span> <span class="n">names</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> |
| <span class="n">int_level</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">level</span><span class="p">)</span> <span class="o">>=</span> <span class="n">nlevels</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Cannot remove </span><span class="si">{}</span><span class="s2"> levels from an index with </span><span class="si">{}</span><span class="s2"> levels: "</span> |
| <span class="s2">"at least one level must be left."</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">level</span><span class="p">),</span> <span class="n">nlevels</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="p">,</span> <span class="n">index_fields</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">item</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">item</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span> |
| <span class="nb">zip</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">i</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">int_level</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_spark_columns</span><span class="p">),</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_names</span><span class="p">),</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_fields</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">droplevel</span><span class="p">(</span><span class="n">level</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.drop"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.drop.html#pyspark.pandas.DataFrame.drop">[docs]</a> <span class="k">def</span> <span class="nf">drop</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">labels</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Drop specified labels from columns.</span> |
| |
| <span class="sd"> Remove columns by specifying label names and axis=1 or columns.</span> |
| <span class="sd"> When specifying both labels and columns, only labels will be dropped.</span> |
| <span class="sd"> Removing rows is yet to be implemented.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> labels : single label or list-like</span> |
| <span class="sd"> Column labels to drop.</span> |
| <span class="sd"> axis : {1 or 'columns'}, default 1</span> |
| <span class="sd"> .. drop currently only works for axis=1 'columns'</span> |
| <span class="sd"> axis=0 is yet to be implemented.</span> |
| <span class="sd"> columns : single label or list-like</span> |
| <span class="sd"> Alternative to specifying axis (``labels, axis=1``</span> |
| <span class="sd"> is equivalent to ``columns=labels``).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> dropped : DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.dropna</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6], 'w': [7, 8]},</span> |
| <span class="sd"> ... columns=['x', 'y', 'z', 'w'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> x y z w</span> |
| <span class="sd"> 0 1 3 5 7</span> |
| <span class="sd"> 1 2 4 6 8</span> |
| |
| <span class="sd"> >>> df.drop('x', axis=1)</span> |
| <span class="sd"> y z w</span> |
| <span class="sd"> 0 3 5 7</span> |
| <span class="sd"> 1 4 6 8</span> |
| |
| <span class="sd"> >>> df.drop(['y', 'z'], axis=1)</span> |
| <span class="sd"> x w</span> |
| <span class="sd"> 0 1 7</span> |
| <span class="sd"> 1 2 8</span> |
| |
| <span class="sd"> >>> df.drop(columns=['y', 'z'])</span> |
| <span class="sd"> x w</span> |
| <span class="sd"> 0 1 7</span> |
| <span class="sd"> 1 2 8</span> |
| |
| <span class="sd"> Also supports MultiIndex columns</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6], 'w': [7, 8]},</span> |
| <span class="sd"> ... columns=['x', 'y', 'z', 'w'])</span> |
| <span class="sd"> >>> columns = [('a', 'x'), ('a', 'y'), ('b', 'z'), ('b', 'w')]</span> |
| <span class="sd"> >>> df.columns = pd.MultiIndex.from_tuples(columns)</span> |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> x y z w</span> |
| <span class="sd"> 0 1 3 5 7</span> |
| <span class="sd"> 1 2 4 6 8</span> |
| <span class="sd"> >>> df.drop('a') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b</span> |
| <span class="sd"> z w</span> |
| <span class="sd"> 0 5 7</span> |
| <span class="sd"> 1 6 8</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Currently only axis = 1 is supported in this function,</span> |
| <span class="sd"> axis = 0 is yet to be implemented.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">labels</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="n">labels</span><span class="p">)</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"Drop currently only works for axis=1"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">columns</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="p">[(</span><span class="n">columns</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">]</span> |
| <span class="n">drop_column_labels</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span> |
| <span class="k">if</span> <span class="n">label</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">col</span><span class="p">)]</span> <span class="o">==</span> <span class="n">col</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">drop_column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> |
| |
| <span class="n">keep_columns_and_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="p">(</span><span class="n">column</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">column</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">drop_column_labels</span> |
| <span class="p">]</span> |
| |
| <span class="n">cols</span><span class="p">,</span> <span class="n">labels</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">keep_columns_and_labels</span><span class="p">)</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">keep_columns_and_labels</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span> <span class="k">else</span> <span class="p">([],</span> <span class="p">[])</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">([</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span><span class="p">])</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Need to specify at least one of 'labels' or 'columns'"</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_sort</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">ascending</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">bool</span><span class="p">]],</span> <span class="n">na_position</span><span class="p">:</span> <span class="nb">str</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ascending</span><span class="p">,</span> <span class="nb">bool</span><span class="p">):</span> |
| <span class="n">ascending</span> <span class="o">=</span> <span class="p">[</span><span class="n">ascending</span><span class="p">]</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">by</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">ascending</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">by</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Length of ascending (</span><span class="si">{}</span><span class="s2">) != length of by (</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">ascending</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">by</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">na_position</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">"first"</span><span class="p">,</span> <span class="s2">"last"</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"invalid na_position: '</span><span class="si">{}</span><span class="s2">'"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">na_position</span><span class="p">))</span> |
| |
| <span class="c1"># Mapper: Get a spark column function for (ascending, na_position) combination</span> |
| <span class="n">mapper</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="s2">"first"</span><span class="p">):</span> <span class="n">Column</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">,</span> |
| <span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="s2">"last"</span><span class="p">):</span> <span class="n">Column</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">,</span> |
| <span class="p">(</span><span class="kc">False</span><span class="p">,</span> <span class="s2">"first"</span><span class="p">):</span> <span class="n">Column</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">,</span> |
| <span class="p">(</span><span class="kc">False</span><span class="p">,</span> <span class="s2">"last"</span><span class="p">):</span> <span class="n">Column</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">,</span> |
| <span class="p">}</span> |
| <span class="n">by</span> <span class="o">=</span> <span class="p">[</span><span class="n">mapper</span><span class="p">[(</span><span class="n">asc</span><span class="p">,</span> <span class="n">na_position</span><span class="p">)](</span><span class="n">scol</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">asc</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">ascending</span><span class="p">)]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="o">*</span><span class="n">by</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span> |
| |
| <div class="viewcode-block" id="DataFrame.sort_values"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.sort_values.html#pyspark.pandas.DataFrame.sort_values">[docs]</a> <span class="k">def</span> <span class="nf">sort_values</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">by</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]],</span> |
| <span class="n">ascending</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">bool</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">na_position</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"last"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Sort by the values of the given column(s).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> by : str or list of str</span> |
| <span class="sd"> Name or list of names of the columns to sort by.</span> |
| <span class="sd"> ascending : bool or list of bool, default True</span> |
| <span class="sd"> Sort ascending vs. descending. Specify list for multiple sort</span> |
| <span class="sd"> orders. If this is a list of bools, must match the length of</span> |
| <span class="sd"> the by.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> if True, perform operation in-place</span> |
| <span class="sd"> na_position : {'first', 'last'}, default 'last'</span> |
| <span class="sd"> `first` puts NaNs at the beginning, `last` puts NaNs at the end</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> sorted_obj : DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'col1': ['A', 'B', None, 'D', 'C'],</span> |
| <span class="sd"> ... 'col2': [2, 9, 8, 7, 4],</span> |
| <span class="sd"> ... 'col3': [0, 9, 4, 2, 3],</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['col1', 'col2', 'col3'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> col1 col2 col3</span> |
| <span class="sd"> 0 A 2 0</span> |
| <span class="sd"> 1 B 9 9</span> |
| <span class="sd"> 2 None 8 4</span> |
| <span class="sd"> 3 D 7 2</span> |
| <span class="sd"> 4 C 4 3</span> |
| |
| <span class="sd"> Sort by col1</span> |
| |
| <span class="sd"> >>> df.sort_values(by=['col1'])</span> |
| <span class="sd"> col1 col2 col3</span> |
| <span class="sd"> 0 A 2 0</span> |
| <span class="sd"> 1 B 9 9</span> |
| <span class="sd"> 4 C 4 3</span> |
| <span class="sd"> 3 D 7 2</span> |
| <span class="sd"> 2 None 8 4</span> |
| |
| <span class="sd"> Sort Descending</span> |
| |
| <span class="sd"> >>> df.sort_values(by='col1', ascending=False)</span> |
| <span class="sd"> col1 col2 col3</span> |
| <span class="sd"> 3 D 7 2</span> |
| <span class="sd"> 4 C 4 3</span> |
| <span class="sd"> 1 B 9 9</span> |
| <span class="sd"> 0 A 2 0</span> |
| <span class="sd"> 2 None 8 4</span> |
| |
| <span class="sd"> Sort by multiple columns</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'col1': ['A', 'A', 'B', None, 'D', 'C'],</span> |
| <span class="sd"> ... 'col2': [2, 1, 9, 8, 7, 4],</span> |
| <span class="sd"> ... 'col3': [0, 1, 9, 4, 2, 3],</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['col1', 'col2', 'col3'])</span> |
| <span class="sd"> >>> df.sort_values(by=['col1', 'col2'])</span> |
| <span class="sd"> col1 col2 col3</span> |
| <span class="sd"> 1 A 1 1</span> |
| <span class="sd"> 0 A 2 0</span> |
| <span class="sd"> 2 B 9 9</span> |
| <span class="sd"> 5 C 4 3</span> |
| <span class="sd"> 4 D 7 2</span> |
| <span class="sd"> 3 None 8 4</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">by</span><span class="p">):</span> |
| <span class="n">by</span> <span class="o">=</span> <span class="p">[</span><span class="n">by</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">by</span><span class="p">),</span> <span class="nb">type</span><span class="p">(</span><span class="n">by</span><span class="p">)</span> |
| |
| <span class="n">new_by</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">colname</span> <span class="ow">in</span> <span class="n">by</span><span class="p">:</span> |
| <span class="n">ser</span> <span class="o">=</span> <span class="bp">self</span><span class="p">[</span><span class="n">colname</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ser</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"The column </span><span class="si">%s</span><span class="s2"> is not unique. For a multi-index, the label must be a tuple "</span> |
| <span class="s2">"with elements corresponding to each level."</span> <span class="o">%</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">colname</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">ser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sort</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">new_by</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">,</span> <span class="n">na_position</span><span class="o">=</span><span class="n">na_position</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sort_index"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.sort_index.html#pyspark.pandas.DataFrame.sort_index">[docs]</a> <span class="k">def</span> <span class="nf">sort_index</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">kind</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">na_position</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"last"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Sort object by labels (along an axis)</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : index, columns to direct sorting. Currently, only axis = 0 is supported.</span> |
| <span class="sd"> level : int or level name or list of ints or list of level names</span> |
| <span class="sd"> if not None, sort on values in specified index level(s)</span> |
| <span class="sd"> ascending : boolean, default True</span> |
| <span class="sd"> Sort ascending vs. descending</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> if True, perform operation in-place</span> |
| <span class="sd"> kind : str, default None</span> |
| <span class="sd"> pandas-on-Spark does not allow specifying the sorting algorithm at the moment,</span> |
| <span class="sd"> default None</span> |
| <span class="sd"> na_position : {'first', 'last'}, default 'last'</span> |
| <span class="sd"> `first` puts NaNs at the beginning, `last` puts NaNs at the end. Not implemented for</span> |
| <span class="sd"> MultiIndex.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> sorted_obj : DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [2, 1, np.nan]}, index=['b', 'a', np.nan])</span> |
| |
| <span class="sd"> >>> df.sort_index()</span> |
| <span class="sd"> A</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> b 2.0</span> |
| <span class="sd"> NaN NaN</span> |
| |
| <span class="sd"> >>> df.sort_index(ascending=False)</span> |
| <span class="sd"> A</span> |
| <span class="sd"> b 2.0</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> NaN NaN</span> |
| |
| <span class="sd"> >>> df.sort_index(na_position='first')</span> |
| <span class="sd"> A</span> |
| <span class="sd"> NaN NaN</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> b 2.0</span> |
| |
| <span class="sd"> >>> df.sort_index(inplace=True)</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A</span> |
| <span class="sd"> a 1.0</span> |
| <span class="sd"> b 2.0</span> |
| <span class="sd"> NaN NaN</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'A': range(4), 'B': range(4)[::-1]},</span> |
| <span class="sd"> ... index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]],</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| |
| <span class="sd"> >>> df.sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> a 0 3 0</span> |
| <span class="sd"> 1 2 1</span> |
| <span class="sd"> b 0 1 2</span> |
| <span class="sd"> 1 0 3</span> |
| |
| <span class="sd"> >>> df.sort_index(level=1) # doctest: +SKIP</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> a 0 3 0</span> |
| <span class="sd"> b 0 1 2</span> |
| <span class="sd"> a 1 2 1</span> |
| <span class="sd"> b 1 0 3</span> |
| |
| <span class="sd"> >>> df.sort_index(level=[1, 0])</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> a 0 3 0</span> |
| <span class="sd"> b 0 1 2</span> |
| <span class="sd"> a 1 2 1</span> |
| <span class="sd"> b 1 0 3</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"No other axis than 0 are supported at the moment"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">kind</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"Specifying the sorting algorithm is not supported at the moment."</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">level</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="p">(</span><span class="n">is_list_like</span><span class="p">(</span><span class="n">level</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">level</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">):</span> <span class="c1"># type: ignore</span> |
| <span class="n">by</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="k">elif</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">level</span><span class="p">):</span> |
| <span class="n">by</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">l</span><span class="p">]</span> <span class="k">for</span> <span class="n">l</span> <span class="ow">in</span> <span class="n">level</span><span class="p">]</span> <span class="c1"># type: ignore</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">by</span> <span class="o">=</span> <span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">level</span><span class="p">]]</span> <span class="c1"># type: ignore</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sort</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">by</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">,</span> <span class="n">na_position</span><span class="o">=</span><span class="n">na_position</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.swaplevel"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.swaplevel.html#pyspark.pandas.DataFrame.swaplevel">[docs]</a> <span class="k">def</span> <span class="nf">swaplevel</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mi">2</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Swap levels i and j in a MultiIndex on a particular axis.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> i, j : int or str</span> |
| <span class="sd"> Levels of the indices to be swapped. Can pass level name as string.</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, default 0</span> |
| <span class="sd"> The axis to swap levels on. 0 or 'index' for row-wise, 1 or</span> |
| <span class="sd"> 'columns' for column-wise.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with levels swapped in MultiIndex.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> midx = pd.MultiIndex.from_arrays(</span> |
| <span class="sd"> ... [['red', 'blue'], [1, 2], ['s', 'm']], names = ['color', 'number', 'size'])</span> |
| <span class="sd"> >>> midx # doctest: +SKIP</span> |
| <span class="sd"> MultiIndex([( 'red', 1, 's'),</span> |
| <span class="sd"> ('blue', 2, 'm')],</span> |
| <span class="sd"> names=['color', 'number', 'size'])</span> |
| |
| <span class="sd"> Swap levels in a MultiIndex on index.</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame({'x': [5, 6], 'y':[5, 6]}, index=midx)</span> |
| <span class="sd"> >>> psdf # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> x y</span> |
| <span class="sd"> color number size</span> |
| <span class="sd"> red 1 s 5 5</span> |
| <span class="sd"> blue 2 m 6 6</span> |
| |
| <span class="sd"> >>> psdf.swaplevel() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> x y</span> |
| <span class="sd"> color size number</span> |
| <span class="sd"> red s 1 5 5</span> |
| <span class="sd"> blue m 2 6 6</span> |
| |
| <span class="sd"> >>> psdf.swaplevel(0, 1) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> x y</span> |
| <span class="sd"> number color size</span> |
| <span class="sd"> 1 red s 5 5</span> |
| <span class="sd"> 2 blue m 6 6</span> |
| |
| <span class="sd"> >>> psdf.swaplevel('number', 'size') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> x y</span> |
| <span class="sd"> color size number</span> |
| <span class="sd"> red s 1 5 5</span> |
| <span class="sd"> blue m 2 6 6</span> |
| |
| <span class="sd"> Swap levels in a MultiIndex on columns.</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame({'x': [5, 6], 'y':[5, 6]})</span> |
| <span class="sd"> >>> psdf.columns = midx</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> color red blue</span> |
| <span class="sd"> number 1 2</span> |
| <span class="sd"> size s m</span> |
| <span class="sd"> 0 5 5</span> |
| <span class="sd"> 1 6 6</span> |
| |
| <span class="sd"> >>> psdf.swaplevel(axis=1)</span> |
| <span class="sd"> color red blue</span> |
| <span class="sd"> size s m</span> |
| <span class="sd"> number 1 2</span> |
| <span class="sd"> 0 5 5</span> |
| <span class="sd"> 1 6 6</span> |
| |
| <span class="sd"> >>> psdf.swaplevel(axis=1)</span> |
| <span class="sd"> color red blue</span> |
| <span class="sd"> size s m</span> |
| <span class="sd"> number 1 2</span> |
| <span class="sd"> 0 5 5</span> |
| <span class="sd"> 1 6 6</span> |
| |
| <span class="sd"> >>> psdf.swaplevel(0, 1, axis=1)</span> |
| <span class="sd"> number 1 2</span> |
| <span class="sd"> color red blue</span> |
| <span class="sd"> size s m</span> |
| <span class="sd"> 0 5 5</span> |
| <span class="sd"> 1 6 6</span> |
| |
| <span class="sd"> >>> psdf.swaplevel('number', 'color', axis=1)</span> |
| <span class="sd"> number 1 2</span> |
| <span class="sd"> color red blue</span> |
| <span class="sd"> size s m</span> |
| <span class="sd"> 0 5 5</span> |
| <span class="sd"> 1 6 6</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_swaplevel_index</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_swaplevel_columns</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.swapaxes"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.swapaxes.html#pyspark.pandas.DataFrame.swapaxes">[docs]</a> <span class="k">def</span> <span class="nf">swapaxes</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">:</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">copy</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Interchange axes and swap values axes appropriately.</span> |
| |
| <span class="sd"> .. note:: This method is based on an expensive operation due to the nature</span> |
| <span class="sd"> of big data. Internally it needs to generate each row for each value, and</span> |
| <span class="sd"> then group twice - it is a huge operation. To prevent misuse, this method</span> |
| <span class="sd"> limits the input length to 'compute.max_rows' and raises a ValueError when it is exceeded.</span> |
| |
| <span class="sd"> >>> from pyspark.pandas.config import option_context</span> |
| <span class="sd"> >>> with option_context('compute.max_rows', 1000): # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> ... ps.DataFrame({'a': range(1001)}).swapaxes(i=0, j=1)</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ValueError: Current DataFrame has more then the given limit 1000 rows.</span> |
| <span class="sd"> Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option'</span> |
| <span class="sd"> to retrieve to retrieve more than 1000 rows. Note that, before changing the</span> |
| <span class="sd"> 'compute.max_rows', this operation is considerably expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> i: {0 or 'index', 1 or 'columns'}. The axis to swap.</span> |
| <span class="sd"> j: {0 or 'index', 1 or 'columns'}. The axis to swap.</span> |
| <span class="sd"> copy : bool, default True. Only True is supported.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['x', 'y', 'z'], columns=['a', 'b', 'c']</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> x 1 2 3</span> |
| <span class="sd"> y 4 5 6</span> |
| <span class="sd"> z 7 8 9</span> |
| <span class="sd"> >>> psdf.swapaxes(i=1, j=0)</span> |
| <span class="sd"> x y z</span> |
| <span class="sd"> a 1 4 7</span> |
| <span class="sd"> b 2 5 8</span> |
| <span class="sd"> c 3 6 9</span> |
| <span class="sd"> >>> psdf.swapaxes(i=1, j=1)</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> x 1 2 3</span> |
| <span class="sd"> y 4 5 6</span> |
| <span class="sd"> z 7 8 9</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">copy</span> <span class="ow">is</span> <span class="kc">True</span> |
| |
| <span class="n">i</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="n">j</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">j</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> <span class="k">if</span> <span class="n">i</span> <span class="o">==</span> <span class="n">j</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">transpose</span><span class="p">()</span></div> |
| |
| <span class="k">def</span> <span class="nf">_swaplevel_columns</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">],</span> <span class="n">j</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">])</span> <span class="o">-></span> <span class="n">InternalFrame</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="ow">and</span> <span class="n">index</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">"Level </span><span class="si">%s</span><span class="s2"> not found"</span> <span class="o">%</span> <span class="n">index</span><span class="p">)</span> |
| |
| <span class="n">i</span> <span class="o">=</span> <span class="n">i</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">names</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="n">j</span> <span class="o">=</span> <span class="n">j</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">j</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">names</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="n">j</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">index</span> <span class="o">>=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> <span class="ow">or</span> <span class="n">index</span> <span class="o"><</span> <span class="o">-</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">IndexError</span><span class="p">(</span> |
| <span class="s2">"Too many levels: Columns have only </span><span class="si">%s</span><span class="s2"> levels, "</span> |
| <span class="s2">"</span><span class="si">%s</span><span class="s2"> is not a valid level number"</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">,</span> <span class="n">index</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">column_label_names</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">column_label_names</span><span class="p">[</span><span class="n">j</span><span class="p">],</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">column_label_names</span><span class="p">[</span><span class="n">j</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">_column_labels</span> |
| <span class="n">column_label_list</span> <span class="o">=</span> <span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">label_list</span> <span class="ow">in</span> <span class="n">column_label_list</span><span class="p">:</span> |
| <span class="n">label_list</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">label_list</span><span class="p">[</span><span class="n">j</span><span class="p">]</span> <span class="o">=</span> <span class="n">label_list</span><span class="p">[</span><span class="n">j</span><span class="p">],</span> <span class="n">label_list</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">column_label_list</span><span class="p">]</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">column_label_names</span><span class="p">),</span> <span class="n">column_labels</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">internal</span> |
| |
| <span class="k">def</span> <span class="nf">_swaplevel_index</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">],</span> <span class="n">j</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Name</span><span class="p">])</span> <span class="o">-></span> <span class="n">InternalFrame</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="ow">and</span> <span class="n">index</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">"Level </span><span class="si">%s</span><span class="s2"> not found"</span> <span class="o">%</span> <span class="n">index</span><span class="p">)</span> |
| |
| <span class="n">i</span> <span class="o">=</span> <span class="n">i</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="n">j</span> <span class="o">=</span> <span class="n">j</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">j</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="n">j</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index</span> <span class="ow">in</span> <span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">index</span> <span class="o">>=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="ow">or</span> <span class="n">index</span> <span class="o"><</span> <span class="o">-</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">IndexError</span><span class="p">(</span> |
| <span class="s2">"Too many levels: Index has only </span><span class="si">%s</span><span class="s2"> levels, "</span> |
| <span class="s2">"</span><span class="si">%s</span><span class="s2"> is not a valid level number"</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">,</span> <span class="n">index</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">index_map</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span> |
| <span class="nb">zip</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">index_map</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">index_map</span><span class="p">[</span><span class="n">j</span><span class="p">]</span> <span class="o">=</span> <span class="n">index_map</span><span class="p">[</span><span class="n">j</span><span class="p">],</span> <span class="n">index_map</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> |
| <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="p">,</span> <span class="n">index_fields</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">index_map</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_spark_columns</span><span class="p">),</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_names</span><span class="p">),</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_fields</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">internal</span> |
| |
| <span class="c1"># TODO: support the `keep` parameter (pandas defaults to keep='first')</span> |
| <div class="viewcode-block" id="DataFrame.nlargest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.nlargest.html#pyspark.pandas.DataFrame.nlargest">[docs]</a> <span class="k">def</span> <span class="nf">nlargest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">columns</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the first `n` rows ordered by `columns` in descending order.</span> |
| |
| <span class="sd"> Return the first `n` rows with the largest values in `columns`, in</span> |
| <span class="sd"> descending order. The columns that are not specified are returned as</span> |
| <span class="sd"> well, but not used for ordering.</span> |
| |
| <span class="sd"> This method is equivalent to</span> |
| <span class="sd"> ``df.sort_values(columns, ascending=False).head(n)``, but more</span> |
| <span class="sd"> performant in pandas.</span> |
| <span class="sd"> In pandas-on-Spark, thanks to Spark's lazy execution and query optimizer,</span> |
| <span class="sd"> the two would have the same performance.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> Number of rows to return.</span> |
| <span class="sd"> columns : label or list of labels</span> |
| <span class="sd"> Column label(s) to order by.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> The first `n` rows ordered by the given columns in descending</span> |
| <span class="sd"> order.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in</span> |
| <span class="sd"> ascending order.</span> |
| <span class="sd"> DataFrame.sort_values : Sort DataFrame by the values.</span> |
| <span class="sd"> DataFrame.head : Return the first `n` rows without re-ordering.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| |
| <span class="sd"> This function cannot be used with all column types. For example, when</span> |
| <span class="sd"> specifying columns with `object` or `category` dtypes, ``TypeError`` is</span> |
| <span class="sd"> raised.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'X': [1, 2, 3, 5, 6, 7, np.nan],</span> |
| <span class="sd"> ... 'Y': [6, 7, 8, 9, 10, 11, 12]})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> X Y</span> |
| <span class="sd"> 0 1.0 6</span> |
| <span class="sd"> 1 2.0 7</span> |
| <span class="sd"> 2 3.0 8</span> |
| <span class="sd"> 3 5.0 9</span> |
| <span class="sd"> 4 6.0 10</span> |
| <span class="sd"> 5 7.0 11</span> |
| <span class="sd"> 6 NaN 12</span> |
| |
| <span class="sd"> In the following example, we will use ``nlargest`` to select the three</span> |
| <span class="sd"> rows having the largest values in column "X".</span> |
| |
| <span class="sd"> >>> df.nlargest(n=3, columns='X')</span> |
| <span class="sd"> X Y</span> |
| <span class="sd"> 5 7.0 11</span> |
| <span class="sd"> 4 6.0 10</span> |
| <span class="sd"> 3 5.0 9</span> |
| |
| <span class="sd"> To order by the largest values in column "Y" and then "X", we can</span> |
| <span class="sd"> specify multiple columns like in the next example.</span> |
| |
| <span class="sd"> >>> df.nlargest(n=3, columns=['Y', 'X'])</span> |
| <span class="sd"> X Y</span> |
| <span class="sd"> 6 NaN 12</span> |
| <span class="sd"> 5 7.0 11</span> |
| <span class="sd"> 4 6.0 10</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="n">n</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: support the `keep` parameter (pandas defaults to keep='first')</span> |
| <div class="viewcode-block" id="DataFrame.nsmallest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.nsmallest.html#pyspark.pandas.DataFrame.nsmallest">[docs]</a> <span class="k">def</span> <span class="nf">nsmallest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">columns</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the first `n` rows ordered by `columns` in ascending order.</span> |
| |
| <span class="sd"> Return the first `n` rows with the smallest values in `columns`, in</span> |
| <span class="sd"> ascending order. The columns that are not specified are returned as</span> |
| <span class="sd"> well, but not used for ordering.</span> |
| |
| <span class="sd"> This method is equivalent to ``df.sort_values(columns, ascending=True).head(n)``,</span> |
| <span class="sd"> but more performant in pandas. In pandas-on-Spark, thanks to Spark's lazy execution and query</span> |
| <span class="sd"> optimizer, the two would have the same performance.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> Number of items to retrieve.</span> |
| <span class="sd"> columns : list or str</span> |
| <span class="sd"> Column name or names to order by.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.nlargest : Return the first `n` rows ordered by `columns` in</span> |
| <span class="sd"> descending order.</span> |
| <span class="sd"> DataFrame.sort_values : Sort DataFrame by the values.</span> |
| <span class="sd"> DataFrame.head : Return the first `n` rows without re-ordering.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'X': [1, 2, 3, 5, 6, 7, np.nan],</span> |
| <span class="sd"> ... 'Y': [6, 7, 8, 9, 10, 11, 12]})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> X Y</span> |
| <span class="sd"> 0 1.0 6</span> |
| <span class="sd"> 1 2.0 7</span> |
| <span class="sd"> 2 3.0 8</span> |
| <span class="sd"> 3 5.0 9</span> |
| <span class="sd"> 4 6.0 10</span> |
| <span class="sd"> 5 7.0 11</span> |
| <span class="sd"> 6 NaN 12</span> |
| |
| <span class="sd"> In the following example, we will use ``nsmallest`` to select the</span> |
| <span class="sd"> three rows having the smallest values in column "X".</span> |
| |
| <span class="sd"> >>> df.nsmallest(n=3, columns='X') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> X Y</span> |
| <span class="sd"> 0 1.0 6</span> |
| <span class="sd"> 1 2.0 7</span> |
| <span class="sd"> 2 3.0 8</span> |
| |
| <span class="sd"> To order by the smallest values in column "Y" and then "X", we can</span> |
| <span class="sd"> specify multiple columns like in the next example.</span> |
| |
| <span class="sd"> >>> df.nsmallest(n=3, columns=['Y', 'X']) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> X Y</span> |
| <span class="sd"> 0 1.0 6</span> |
| <span class="sd"> 1 2.0 7</span> |
| <span class="sd"> 2 3.0 8</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="n">n</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.isin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.isin.html#pyspark.pandas.DataFrame.isin">[docs]</a> <span class="k">def</span> <span class="nf">isin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">values</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">,</span> <span class="n">Dict</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Whether each element in the DataFrame is contained in values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> values : iterable or dict</span> |
| <span class="sd"> The sequence of values to test. If values is a dict,</span> |
| <span class="sd"> the keys must be the column names, which must match.</span> |
| <span class="sd"> Series and DataFrame are not supported.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame of booleans showing whether each element in the DataFrame</span> |
| <span class="sd"> is contained in values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},</span> |
| <span class="sd"> ... index=['falcon', 'dog'],</span> |
| <span class="sd"> ... columns=['num_legs', 'num_wings'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> num_legs num_wings</span> |
| <span class="sd"> falcon 2 2</span> |
| <span class="sd"> dog 4 0</span> |
| |
| <span class="sd"> When ``values`` is a list, check whether every value in the DataFrame</span> |
| <span class="sd"> is present in the list (which animals have 0 or 2 legs or wings):</span> |
| |
| <span class="sd"> >>> df.isin([0, 2])</span> |
| <span class="sd"> num_legs num_wings</span> |
| <span class="sd"> falcon True True</span> |
| <span class="sd"> dog False True</span> |
| |
| <span class="sd"> When ``values`` is a dict, we can pass values to check for each</span> |
| <span class="sd"> column separately:</span> |
| |
| <span class="sd"> >>> df.isin({'num_wings': [0, 3]})</span> |
| <span class="sd"> num_legs num_wings</span> |
| <span class="sd"> falcon False False</span> |
| <span class="sd"> dog False True</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"DataFrame and Series are not supported"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">set</span><span class="p">(</span><span class="n">values</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span> |
| <span class="s2">"'DataFrame' object has no attribute </span><span class="si">%s</span><span class="s2">"</span> |
| <span class="o">%</span> <span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">values</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span><span class="o">.</span><span class="n">difference</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="n">data_spark_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">values</span><span class="p">:</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="n">item</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">)</span> <span class="k">else</span> <span class="nb">list</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[</span><span class="n">i</span><span class="p">])</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">item</span><span class="p">]</span> |
| <span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span> |
| <span class="n">data_spark_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="n">i</span><span class="p">]))</span> |
| <span class="k">elif</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">values</span><span class="p">):</span> |
| <span class="n">values</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">cast</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">,</span> <span class="n">values</span><span class="p">)</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">)</span> |
| <span class="k">else</span> <span class="nb">list</span><span class="p">(</span><span class="n">values</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">isin</span><span class="p">([</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">values</span><span class="p">])</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))</span> |
| <span class="n">data_spark_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Values should be iterable, Series, DataFrame or dict."</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="n">data_spark_columns</span><span class="p">,</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">dtype</span><span class="p">(</span><span class="s2">"bool"</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">=</span><span class="n">BooleanType</span><span class="p">(),</span> <span class="n">nullable</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">shape</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a tuple representing the dimensionality of the DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'col1': [1, 2], 'col2': [3, 4]})</span> |
| <span class="sd"> >>> df.shape</span> |
| <span class="sd"> (2, 2)</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'col1': [1, 2], 'col2': [3, 4],</span> |
| <span class="sd"> ... 'col3': [5, 6]})</span> |
| <span class="sd"> >>> df.shape</span> |
| <span class="sd"> (2, 3)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.merge"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.merge.html#pyspark.pandas.DataFrame.merge">[docs]</a> <span class="k">def</span> <span class="nf">merge</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">right</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">,</span> |
| <span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"inner"</span><span class="p">,</span> |
| <span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">left_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">right_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">left_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">right_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">suffixes</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">"_x"</span><span class="p">,</span> <span class="s2">"_y"</span><span class="p">),</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Merge DataFrame objects with a database-style join.</span> |
| |
| <span class="sd"> The index of the resulting DataFrame will be one of the following:</span> |
| <span class="sd"> - 0...n if no index is used for merging</span> |
| <span class="sd"> - Index of the left DataFrame if merged only on the index of the right DataFrame</span> |
| <span class="sd"> - Index of the right DataFrame if merged only on the index of the left DataFrame</span> |
| <span class="sd"> - All involved indices if merged using the indices of both DataFrames</span> |
| <span class="sd"> e.g. if `left` has indices (a, x) and `right` has indices (b, x), the result will</span> |
| <span class="sd"> have the index (x, a, b)</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> right: Object to merge with.</span> |
| <span class="sd"> how: Type of merge to be performed.</span> |
| <span class="sd"> {'left', 'right', 'outer', 'inner'}, default 'inner'</span> |
| |
| <span class="sd"> left: use only keys from left frame, similar to a SQL left outer join; does not preserve</span> |
| <span class="sd"> key order, unlike pandas.</span> |
| <span class="sd"> right: use only keys from right frame, similar to a SQL right outer join; does not preserve</span> |
| <span class="sd"> key order, unlike pandas.</span> |
| <span class="sd"> outer: use union of keys from both frames, similar to a SQL full outer join; sort keys</span> |
| <span class="sd"> lexicographically.</span> |
| <span class="sd"> inner: use intersection of keys from both frames, similar to a SQL inner join;</span> |
| <span class="sd"> does not preserve the order of the left keys, unlike pandas.</span> |
| <span class="sd"> on: Column or index level names to join on. These must be found in both DataFrames. If on</span> |
| <span class="sd"> is None and not merging on indexes then this defaults to the intersection of the</span> |
| <span class="sd"> columns in both DataFrames.</span> |
| <span class="sd"> left_on: Column or index level names to join on in the left DataFrame. Can also</span> |
| <span class="sd"> be an array or list of arrays of the length of the left DataFrame.</span> |
| <span class="sd"> These arrays are treated as if they are columns.</span> |
| <span class="sd"> right_on: Column or index level names to join on in the right DataFrame. Can also</span> |
| <span class="sd"> be an array or list of arrays of the length of the right DataFrame.</span> |
| <span class="sd"> These arrays are treated as if they are columns.</span> |
| <span class="sd"> left_index: Use the index from the left DataFrame as the join key(s). If it is a</span> |
| <span class="sd"> MultiIndex, the number of keys in the other DataFrame (either the index or a number of</span> |
| <span class="sd"> columns) must match the number of levels.</span> |
| <span class="sd"> right_index: Use the index from the right DataFrame as the join key. Same caveats as</span> |
| <span class="sd"> left_index.</span> |
| <span class="sd"> suffixes: Suffix to apply to overlapping column names in the left and right side,</span> |
| <span class="sd"> respectively.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> A DataFrame of the two merged objects.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.join : Join columns of another DataFrame.</span> |
| <span class="sd"> DataFrame.update : Modify in place using non-NA values from another DataFrame.</span> |
| <span class="sd"> DataFrame.hint : Specifies some hint on the current DataFrame.</span> |
| <span class="sd"> broadcast : Marks a DataFrame as small enough for use in broadcast joins.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],</span> |
| <span class="sd"> ... 'value': [1, 2, 3, 5]},</span> |
| <span class="sd"> ... columns=['lkey', 'value'])</span> |
| <span class="sd"> >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],</span> |
| <span class="sd"> ... 'value': [5, 6, 7, 8]},</span> |
| <span class="sd"> ... columns=['rkey', 'value'])</span> |
| <span class="sd"> >>> df1</span> |
| <span class="sd"> lkey value</span> |
| <span class="sd"> 0 foo 1</span> |
| <span class="sd"> 1 bar 2</span> |
| <span class="sd"> 2 baz 3</span> |
| <span class="sd"> 3 foo 5</span> |
| <span class="sd"> >>> df2</span> |
| <span class="sd"> rkey value</span> |
| <span class="sd"> 0 foo 5</span> |
| <span class="sd"> 1 bar 6</span> |
| <span class="sd"> 2 baz 7</span> |
| <span class="sd"> 3 foo 8</span> |
| |
| <span class="sd"> Merge df1 and df2 on the lkey and rkey columns. The value columns have</span> |
| <span class="sd"> the default suffixes, _x and _y, appended.</span> |
| |
| <span class="sd"> >>> merged = df1.merge(df2, left_on='lkey', right_on='rkey')</span> |
| <span class="sd"> >>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y']) # doctest: +ELLIPSIS</span> |
| <span class="sd"> lkey value_x rkey value_y</span> |
| <span class="sd"> ...bar 2 bar 6</span> |
| <span class="sd"> ...baz 3 baz 7</span> |
| <span class="sd"> ...foo 1 foo 5</span> |
| <span class="sd"> ...foo 1 foo 8</span> |
| <span class="sd"> ...foo 5 foo 5</span> |
| <span class="sd"> ...foo 5 foo 8</span> |
| |
| <span class="sd"> >>> left_psdf = ps.DataFrame({'A': [1, 2]})</span> |
| <span class="sd"> >>> right_psdf = ps.DataFrame({'B': ['x', 'y']}, index=[1, 2])</span> |
| |
| <span class="sd"> >>> left_psdf.merge(right_psdf, left_index=True, right_index=True).sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 2 x</span> |
| |
| <span class="sd"> >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='left').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 None</span> |
| <span class="sd"> 1 2 x</span> |
| |
| <span class="sd"> >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='right').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 2.0 x</span> |
| <span class="sd"> 2 NaN y</span> |
| |
| <span class="sd"> >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='outer').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1.0 None</span> |
| <span class="sd"> 1 2.0 x</span> |
| <span class="sd"> 2 NaN y</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> As described in #263, joining string columns currently returns None for missing values</span> |
| <span class="sd"> instead of NaN.</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">to_list</span><span class="p">(</span><span class="n">os</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="n">os</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">[]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">os</span><span class="p">):</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">os</span><span class="p">]</span> <span class="c1"># type: ignore</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">os</span><span class="p">):</span> |
| <span class="k">return</span> <span class="p">[(</span><span class="n">os</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">o</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">o</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">o</span><span class="p">,)</span> <span class="k">for</span> <span class="n">o</span> <span class="ow">in</span> <span class="n">os</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">right</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">right</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="n">on</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">left_on</span> <span class="ow">or</span> <span class="n">right_on</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s1">'Can only pass argument "on" OR "left_on" and "right_on", '</span> |
| <span class="s2">"not a combination of both."</span> |
| <span class="p">)</span> |
| <span class="n">left_key_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">on</span><span class="p">)))</span> |
| <span class="n">right_key_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">on</span><span class="p">)))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># TODO: need special handling for multi-index.</span> |
| <span class="k">if</span> <span class="n">left_index</span><span class="p">:</span> |
| <span class="n">left_key_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">left_key_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">left_on</span><span class="p">)))</span> |
| <span class="k">if</span> <span class="n">right_index</span><span class="p">:</span> |
| <span class="n">right_key_names</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">right_key_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span> |
| <span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">right_on</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">left_key_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_key_names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Must pass right_on or right_index=True"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">right_key_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">left_key_names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Must pass left_on or left_index=True"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">left_key_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_key_names</span><span class="p">:</span> |
| <span class="n">common</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">common</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"No common columns to perform merge on. Merge options: "</span> |
| <span class="s2">"left_on=None, right_on=None, left_index=False, right_index=False"</span> |
| <span class="p">)</span> |
| <span class="n">left_key_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">common</span><span class="p">)))</span> |
| <span class="n">right_key_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">common</span><span class="p">)))</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">left_key_names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">right_key_names</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"len(left_keys) must equal len(right_keys)"</span><span class="p">)</span> |
| |
| <span class="c1"># Prefix the right-side key names to avoid ambiguous column names after merging.</span> |
| <span class="n">right_prefix</span> <span class="o">=</span> <span class="s2">"__right_"</span> |
| <span class="n">right_key_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">right_key_name</span> <span class="k">for</span> <span class="n">right_key_name</span> <span class="ow">in</span> <span class="n">right_key_names</span><span class="p">]</span> |
| |
| <span class="n">how</span> <span class="o">=</span> <span class="n">validate_how</span><span class="p">(</span><span class="n">how</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">resolve</span><span class="p">(</span><span class="n">internal</span><span class="p">:</span> <span class="n">InternalFrame</span><span class="p">,</span> <span class="n">side</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">InternalFrame</span><span class="p">:</span> |
| <span class="n">rename</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="s2">"__</span><span class="si">{}</span><span class="s2">_</span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">side</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">HIDDEN_COLUMNS</span> |
| <span class="p">],</span> |
| <span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">rename</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span> |
| <span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">rename</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">rename</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="n">left_internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="n">right_internal</span> <span class="o">=</span> <span class="n">resolve</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="p">,</span> <span class="s2">"right"</span><span class="p">)</span> |
| |
| <span class="n">left_table</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"left_table"</span><span class="p">)</span> |
| <span class="n">right_table</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"right_table"</span><span class="p">)</span> |
| |
| <span class="n">left_key_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">left_table</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">left_key_names</span><span class="p">]</span> |
| <span class="n">right_key_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">right_table</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">right_key_names</span><span class="p">]</span> |
| |
| <span class="n">join_condition</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&</span> <span class="n">y</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">lkey</span> <span class="o">==</span> <span class="n">rkey</span> <span class="k">for</span> <span class="n">lkey</span><span class="p">,</span> <span class="n">rkey</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">left_key_columns</span><span class="p">,</span> <span class="n">right_key_columns</span><span class="p">)],</span> |
| <span class="p">)</span> |
| |
| <span class="n">joined_table</span> <span class="o">=</span> <span class="n">left_table</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">right_table</span><span class="p">,</span> <span class="n">join_condition</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="n">how</span><span class="p">)</span> |
| |
| <span class="c1"># Unpack suffixes tuple for convenience</span> |
| <span class="n">left_suffix</span> <span class="o">=</span> <span class="n">suffixes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">right_suffix</span> <span class="o">=</span> <span class="n">suffixes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> |
| |
| <span class="c1"># Append suffixes to columns with the same name to avoid conflicts later</span> |
| <span class="n">duplicate_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&</span> <span class="nb">set</span><span class="p">(</span><span class="n">right_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> |
| |
| <span class="n">exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| |
| <span class="n">left_scol_for</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">label</span><span class="p">:</span> <span class="n">scol_for</span><span class="p">(</span> |
| <span class="n">left_table</span><span class="p">,</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">right_scol_for</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">label</span><span class="p">:</span> <span class="n">scol_for</span><span class="p">(</span> |
| <span class="n">right_table</span><span class="p">,</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">left_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicate_columns</span><span class="p">:</span> |
| <span class="n">spark_column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="n">spark_column_name</span> <span class="ow">in</span> <span class="n">left_key_names</span> |
| <span class="ow">and</span> <span class="p">(</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">spark_column_name</span><span class="p">)</span> <span class="ow">in</span> <span class="n">right_key_names</span> |
| <span class="p">):</span> |
| <span class="n">right_scol</span> <span class="o">=</span> <span class="n">right_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"right"</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">right_scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"full"</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">scol</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">right_scol</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">pass</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">+</span> <span class="n">left_suffix</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="n">left_suffix</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]))</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| <span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="c1"># recover `right_prefix` here.</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)[</span><span class="nb">len</span><span class="p">(</span><span class="n">right_prefix</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">right_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicate_columns</span><span class="p">:</span> |
| <span class="n">spark_column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="n">spark_column_name</span> <span class="ow">in</span> <span class="n">left_key_names</span> |
| <span class="ow">and</span> <span class="p">(</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">spark_column_name</span><span class="p">)</span> <span class="ow">in</span> <span class="n">right_key_names</span> |
| <span class="p">):</span> |
| <span class="k">continue</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">+</span> <span class="n">right_suffix</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="n">right_suffix</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]))</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span> |
| <span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="n">left_index_scols</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="n">right_index_scols</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| |
| <span class="c1"># Retain indices if they are used for joining</span> |
| <span class="k">if</span> <span class="n">left_index</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">right_index</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">how</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">"inner"</span><span class="p">,</span> <span class="s2">"left"</span><span class="p">):</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">left_index_scols</span><span class="p">)</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"right"</span><span class="p">:</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">right_index_scols</span><span class="p">)</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="k">for</span> <span class="n">col</span><span class="p">,</span> <span class="n">left_scol</span><span class="p">,</span> <span class="n">right_scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span> |
| <span class="n">index_spark_column_names</span><span class="p">,</span> <span class="n">left_index_scols</span><span class="p">,</span> <span class="n">right_index_scols</span> |
| <span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">left_scol</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">left_scol</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">right_scol</span><span class="p">)</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">right_index_scols</span><span class="p">)</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="k">elif</span> <span class="n">right_index</span><span class="p">:</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">left_index_scols</span><span class="p">)</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span> |
| |
| <span class="n">selected_columns</span> <span class="o">=</span> <span class="n">joined_table</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">exprs</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">selected_columns</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">selected_columns</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">selected_columns</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.join"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.join.html#pyspark.pandas.DataFrame.join">[docs]</a> <span class="k">def</span> <span class="nf">join</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">right</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">,</span> |
| <span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"left"</span><span class="p">,</span> |
| <span class="n">lsuffix</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">""</span><span class="p">,</span> |
| <span class="n">rsuffix</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">""</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Join columns of another DataFrame.</span> |
| |
| <span class="sd"> Join columns with `right` DataFrame either on index or on a key column. Efficiently join</span> |
| <span class="sd"> multiple DataFrame objects by index at once by passing a list.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> right: DataFrame, Series</span> |
| <span class="sd"> on: str, list of str, or array-like, optional</span> |
| <span class="sd"> Column or index level name(s) in the caller to join on the index in `right`, otherwise</span> |
| <span class="sd"> joins index-on-index. If multiple values given, the `right` DataFrame must have a</span> |
| <span class="sd"> MultiIndex. Can pass an array as the join key if it is not already contained in the</span> |
| <span class="sd"> calling DataFrame. Like an Excel VLOOKUP operation.</span> |
| <span class="sd"> how: {'left', 'right', 'outer', 'inner'}, default 'left'</span> |
| <span class="sd"> How to handle the operation of the two objects.</span> |
| |
| <span class="sd"> * left: use `left` frame's index (or column if on is specified).</span> |
| <span class="sd"> * right: use `right`'s index.</span> |
| <span class="sd"> * outer: form union of `left` frame's index (or column if on is specified) with</span> |
| <span class="sd"> `right`'s index, and sort it lexicographically.</span> |
| <span class="sd"> * inner: form intersection of `left` frame's index (or column if on is specified)</span> |
| <span class="sd"> with `right`'s index, preserving the order of the `left`'s one.</span> |
| <span class="sd"> lsuffix : str, default ''</span> |
| <span class="sd"> Suffix to use from left frame's overlapping columns.</span> |
| <span class="sd"> rsuffix : str, default ''</span> |
| <span class="sd"> Suffix to use from `right` frame's overlapping columns.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> A dataframe containing columns from both the `left` and `right`.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.merge : For column(s)-on-column(s) operations.</span> |
| <span class="sd"> DataFrame.update : Modify in place using non-NA values from another DataFrame.</span> |
| <span class="sd"> DataFrame.hint : Specifies some hint on the current DataFrame.</span> |
| <span class="sd"> broadcast : Marks a DataFrame as small enough for use in broadcast joins.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Parameters on, lsuffix, and rsuffix are not supported when passing a list of DataFrame</span> |
| <span class="sd"> objects.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf1 = ps.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],</span> |
| <span class="sd"> ... 'A': ['A0', 'A1', 'A2', 'A3']},</span> |
| <span class="sd"> ... columns=['key', 'A'])</span> |
| <span class="sd"> >>> psdf2 = ps.DataFrame({'key': ['K0', 'K1', 'K2'],</span> |
| <span class="sd"> ... 'B': ['B0', 'B1', 'B2']},</span> |
| <span class="sd"> ... columns=['key', 'B'])</span> |
| <span class="sd"> >>> psdf1</span> |
| <span class="sd"> key A</span> |
| <span class="sd"> 0 K0 A0</span> |
| <span class="sd"> 1 K1 A1</span> |
| <span class="sd"> 2 K2 A2</span> |
| <span class="sd"> 3 K3 A3</span> |
| <span class="sd"> >>> psdf2</span> |
| <span class="sd"> key B</span> |
| <span class="sd"> 0 K0 B0</span> |
| <span class="sd"> 1 K1 B1</span> |
| <span class="sd"> 2 K2 B2</span> |
| |
| <span class="sd"> Join DataFrames using their indexes.</span> |
| |
| <span class="sd"> >>> join_psdf = psdf1.join(psdf2, lsuffix='_left', rsuffix='_right')</span> |
| <span class="sd"> >>> join_psdf.sort_values(by=join_psdf.columns)</span> |
| <span class="sd"> key_left A key_right B</span> |
| <span class="sd"> 0 K0 A0 K0 B0</span> |
| <span class="sd"> 1 K1 A1 K1 B1</span> |
| <span class="sd"> 2 K2 A2 K2 B2</span> |
| <span class="sd"> 3 K3 A3 None None</span> |
| |
| <span class="sd"> If we want to join using the key columns, we need to set key to be the index in both df and</span> |
| <span class="sd"> right. The joined DataFrame will have key as its index.</span> |
| |
| <span class="sd"> >>> join_psdf = psdf1.set_index('key').join(psdf2.set_index('key'))</span> |
| <span class="sd"> >>> join_psdf.sort_values(by=join_psdf.columns) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> key</span> |
| <span class="sd"> K0 A0 B0</span> |
| <span class="sd"> K1 A1 B1</span> |
| <span class="sd"> K2 A2 B2</span> |
| <span class="sd"> K3 A3 None</span> |
| |
| <span class="sd"> Another option to join using the key columns is to use the on parameter. DataFrame.join</span> |
| <span class="sd"> always uses right's index but we can use any column in df. This method does not preserve the</span> |
| <span class="sd"> original DataFrame's index in the result, unlike pandas.</span> |
| |
| <span class="sd"> >>> join_psdf = psdf1.join(psdf2.set_index('key'), on='key')</span> |
| <span class="sd"> >>> join_psdf.index</span> |
| <span class="sd"> Int64Index([0, 1, 2, 3], dtype='int64')</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">right</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">common</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">intersection</span><span class="p">([</span><span class="n">right</span><span class="o">.</span><span class="n">name</span><span class="p">]))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">common</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">common</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">lsuffix</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">rsuffix</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"columns overlap but no suffix specified: "</span> <span class="s2">"</span><span class="si">{rename}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">rename</span><span class="o">=</span><span class="n">common</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">need_set_index</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="k">if</span> <span class="n">on</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">on</span><span class="p">):</span> |
| <span class="n">on</span> <span class="o">=</span> <span class="p">[</span><span class="n">on</span><span class="p">]</span> <span class="c1"># type: ignore</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">on</span><span class="p">)</span> <span class="o">!=</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s1">'len(left_on) must equal the number of levels in the index of "right"'</span> |
| <span class="p">)</span> |
| |
| <span class="n">need_set_index</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">on</span><span class="p">)</span> <span class="o">&</span> <span class="nb">set</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span><span class="p">))</span> <span class="o">==</span> <span class="mi">0</span> |
| <span class="k">if</span> <span class="n">need_set_index</span><span class="p">:</span> |
| <span class="bp">self</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="n">on</span><span class="p">)</span> |
| <span class="n">join_psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span> |
| <span class="n">right</span><span class="p">,</span> <span class="n">left_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">right_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="n">how</span><span class="p">,</span> <span class="n">suffixes</span><span class="o">=</span><span class="p">(</span><span class="n">lsuffix</span><span class="p">,</span> <span class="n">rsuffix</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">join_psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> <span class="k">if</span> <span class="n">need_set_index</span> <span class="k">else</span> <span class="n">join_psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.append"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.append.html#pyspark.pandas.DataFrame.append">[docs]</a> <span class="k">def</span> <span class="nf">append</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">,</span> |
| <span class="n">ignore_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">verify_integrity</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">sort</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Append rows of other to the end of caller, returning a new object.</span> |
| |
| <span class="sd"> Columns in other that are not in the caller are added as new columns.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : DataFrame or Series/dict-like object, or list of these</span> |
| <span class="sd"> The data to append.</span> |
| |
| <span class="sd"> ignore_index : boolean, default False</span> |
| <span class="sd"> If True, do not use the index labels.</span> |
| |
| <span class="sd"> verify_integrity : boolean, default False</span> |
| <span class="sd"> If True, raise ValueError on creating index with duplicates.</span> |
| |
| <span class="sd"> sort : boolean, default False</span> |
| <span class="sd"> Currently not supported.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> appended : DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[1, 2], [3, 4]], columns=list('AB'))</span> |
| |
| <span class="sd"> >>> df.append(df)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 1 3 4</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 1 3 4</span> |
| |
| <span class="sd"> >>> df.append(df, ignore_index=True)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 2</span> |
| <span class="sd"> 1 3 4</span> |
| <span class="sd"> 2 1 2</span> |
| <span class="sd"> 3 3 4</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"DataFrames.append() does not support appending Series to DataFrames"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">sort</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"The 'sort' parameter is currently not supported"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">ignore_index</span><span class="p">:</span> |
| <span class="n">index_scols</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_scols</span><span class="p">)</span> <span class="o">!=</span> <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Both DataFrames have to have the same number of index levels"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">verify_integrity</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_scols</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">index_scols</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">intersect</span><span class="p">(</span> |
| <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="o">.</span><span class="n">count</span><span class="p">()</span> |
| <span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Indices have overlapping values"</span><span class="p">)</span> |
| |
| <span class="c1"># Lazy import to avoid circular dependency issues</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.namespace</span> <span class="kn">import</span> <span class="n">concat</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">concat</span><span class="p">([</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">],</span> <span class="n">ignore_index</span><span class="o">=</span><span class="n">ignore_index</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: add 'filter_func' and 'errors' parameter</span> |
| <div class="viewcode-block" id="DataFrame.update"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.update.html#pyspark.pandas.DataFrame.update">[docs]</a> <span class="k">def</span> <span class="nf">update</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">,</span> <span class="n">join</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"left"</span><span class="p">,</span> <span class="n">overwrite</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Modify in place using non-NA values from another DataFrame.</span> |
| <span class="sd"> Aligns on indices. There is no return value.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : DataFrame, or Series</span> |
| <span class="sd"> join : 'left', default 'left'</span> |
| <span class="sd"> Only left join is implemented, keeping the index and columns of the original object.</span> |
| <span class="sd"> overwrite : bool, default True</span> |
| <span class="sd"> How to handle non-NA values for overlapping keys:</span> |
| |
| <span class="sd"> * True: overwrite original DataFrame's values with values from `other`.</span> |
| <span class="sd"> * False: only update values that are NA in the original DataFrame.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> None : method directly changes calling object</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.merge : For column(s)-on-column(s) operations.</span> |
| <span class="sd"> DataFrame.join : Join columns of another DataFrame.</span> |
| <span class="sd"> DataFrame.hint : Specifies some hint on the current DataFrame.</span> |
| <span class="sd"> broadcast : Marks a DataFrame as small enough for use in broadcast joins.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]}, columns=['A', 'B'])</span> |
| <span class="sd"> >>> new_df = ps.DataFrame({'B': [4, 5, 6], 'C': [7, 8, 9]}, columns=['B', 'C'])</span> |
| <span class="sd"> >>> df.update(new_df)</span> |
| <span class="sd"> >>> df.sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 1 2 5</span> |
| <span class="sd"> 2 3 6</span> |
| |
| <span class="sd"> The DataFrame's length does not increase as a result of the update,</span> |
| <span class="sd"> only values at matching index/column labels are updated.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'A': ['a', 'b', 'c'], 'B': ['x', 'y', 'z']}, columns=['A', 'B'])</span> |
| <span class="sd"> >>> new_df = ps.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}, columns=['B'])</span> |
| <span class="sd"> >>> df.update(new_df)</span> |
| <span class="sd"> >>> df.sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 a d</span> |
| <span class="sd"> 1 b e</span> |
| <span class="sd"> 2 c f</span> |
| |
| <span class="sd"> For Series, its name attribute must be set.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'A': ['a', 'b', 'c'], 'B': ['x', 'y', 'z']}, columns=['A', 'B'])</span> |
| <span class="sd"> >>> new_column = ps.Series(['d', 'e'], name='B', index=[0, 2])</span> |
| <span class="sd"> >>> df.update(new_column)</span> |
| <span class="sd"> >>> df.sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 a d</span> |
| <span class="sd"> 1 b y</span> |
| <span class="sd"> 2 c e</span> |
| |
| <span class="sd"> If `other` contains None the corresponding values are not updated in the original dataframe.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]}, columns=['A', 'B'])</span> |
| <span class="sd"> >>> new_df = ps.DataFrame({'B': [4, None, 6]}, columns=['B'])</span> |
| <span class="sd"> >>> df.update(new_df)</span> |
| <span class="sd"> >>> df.sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 4.0</span> |
| <span class="sd"> 1 2 500.0</span> |
| <span class="sd"> 2 3 6.0</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">join</span> <span class="o">!=</span> <span class="s2">"left"</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"Only left join is supported"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">other</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| |
| <span class="n">update_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span> |
| <span class="nb">set</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="n">update_sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> |
| <span class="n">other</span><span class="p">[</span><span class="n">update_columns</span><span class="p">],</span> <span class="n">rsuffix</span><span class="o">=</span><span class="s2">"_new"</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span> |
| |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">column_labels</span> <span class="ow">in</span> <span class="n">update_columns</span><span class="p">:</span> |
| <span class="n">column_name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="n">old_col</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">update_sdf</span><span class="p">,</span> <span class="n">column_name</span><span class="p">)</span> |
| <span class="n">new_col</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span> |
| <span class="n">update_sdf</span><span class="p">,</span> <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">+</span> <span class="s2">"_new"</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">overwrite</span><span class="p">:</span> |
| <span class="n">update_sdf</span> <span class="o">=</span> <span class="n">update_sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">column_name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">new_col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">old_col</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">new_col</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">update_sdf</span> <span class="o">=</span> <span class="n">update_sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">column_name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">old_col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">new_col</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">old_col</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">data_fields</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">update_sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">update_sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_names</span><span class="p">],</span> |
| <span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="p">,</span> <span class="n">requires_same_anchor</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sample"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.sample.html#pyspark.pandas.DataFrame.sample">[docs]</a> <span class="k">def</span> <span class="nf">sample</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">n</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">frac</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">replace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">random_state</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a random sample of items from an axis of object.</span> |
| |
| <span class="sd"> Please call this function using named argument by specifying the ``frac`` argument.</span> |
| |
| <span class="sd"> You can use `random_state` for reproducibility. However, note that unlike pandas,</span> |
| <span class="sd"> specifying a seed in pandas-on-Spark/Spark does not guarantee the sampled rows will</span> |
| <span class="sd"> be fixed. The result set depends on not only the seed, but also how the data is distributed</span> |
| <span class="sd"> across machines and to some extent network randomness when shuffle operations are involved.</span> |
| <span class="sd"> Even in the simplest case, the result set will depend on the system's CPU core count.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, optional</span> |
| <span class="sd"> Number of items to return. This is currently NOT supported. Use frac instead.</span> |
| <span class="sd"> frac : float, optional</span> |
| <span class="sd"> Fraction of axis items to return.</span> |
| <span class="sd"> replace : bool, default False</span> |
| <span class="sd"> Sample with or without replacement.</span> |
| <span class="sd"> random_state : int, optional</span> |
| <span class="sd"> Seed for the random number generator (if int).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> A new object of same type as caller containing the sampled items.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'num_legs': [2, 4, 8, 0],</span> |
| <span class="sd"> ... 'num_wings': [2, 0, 0, 0],</span> |
| <span class="sd"> ... 'num_specimen_seen': [10, 2, 1, 8]},</span> |
| <span class="sd"> ... index=['falcon', 'dog', 'spider', 'fish'],</span> |
| <span class="sd"> ... columns=['num_legs', 'num_wings', 'num_specimen_seen'])</span> |
| <span class="sd"> >>> df # doctest: +SKIP</span> |
| <span class="sd"> num_legs num_wings num_specimen_seen</span> |
| <span class="sd"> falcon 2 2 10</span> |
| <span class="sd"> dog 4 0 2</span> |
| <span class="sd"> spider 8 0 1</span> |
| <span class="sd"> fish 0 0 8</span> |
| |
| <span class="sd"> A random 25% sample of the ``DataFrame``.</span> |
| <span class="sd"> Note that we use `random_state` to ensure the reproducibility of</span> |
| <span class="sd"> the examples.</span> |
| |
| <span class="sd"> >>> df.sample(frac=0.25, random_state=1) # doctest: +SKIP</span> |
| <span class="sd"> num_legs num_wings num_specimen_seen</span> |
| <span class="sd"> falcon 2 2 10</span> |
| <span class="sd"> fish 0 0 8</span> |
| |
| <span class="sd"> Extract 40% random elements from the ``Series`` ``df['num_legs']``, with replacement,</span> |
| <span class="sd"> so the same items could appear more than once.</span> |
| |
| <span class="sd"> >>> df['num_legs'].sample(frac=0.4, replace=True, random_state=1) # doctest: +SKIP</span> |
| <span class="sd"> falcon 2</span> |
| <span class="sd"> spider 8</span> |
| <span class="sd"> spider 8</span> |
| <span class="sd"> Name: num_legs, dtype: int64</span> |
| |
| <span class="sd"> Specifying the exact number of items to return is not supported at the moment.</span> |
| |
| <span class="sd"> >>> df.sample(n=5) # doctest: +ELLIPSIS</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> NotImplementedError: Function sample currently does not support specifying ...</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Note: the sampling doctests above are skipped because the result can change depending on the</span> |
| <span class="c1"># system's core count.</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"Function sample currently does not support specifying "</span> |
| <span class="s2">"exact number of items to return. Use frac instead."</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">frac</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"frac must be specified."</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span> |
| <span class="n">withReplacement</span><span class="o">=</span><span class="n">replace</span><span class="p">,</span> <span class="n">fraction</span><span class="o">=</span><span class="n">frac</span><span class="p">,</span> <span class="n">seed</span><span class="o">=</span><span class="n">random_state</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.astype"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.astype.html#pyspark.pandas.DataFrame.astype">[docs]</a> <span class="k">def</span> <span class="nf">astype</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dtype</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Cast a pandas-on-Spark object to a specified dtype ``dtype``.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> dtype : data type, or dict of column name -> data type</span> |
| <span class="sd"> Use a numpy.dtype or Python type to cast entire pandas-on-Spark object to</span> |
| <span class="sd"> the same type. Alternatively, use {col: dtype, ...}, where col is a</span> |
| <span class="sd"> column label and dtype is a numpy.dtype or Python type to cast one</span> |
| <span class="sd"> or more of the DataFrame's columns to column-specific types.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> casted : same type as caller</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> to_datetime : Convert argument to datetime.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]}, dtype='int64')</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 2 2</span> |
| <span class="sd"> 2 3 3</span> |
| |
| <span class="sd"> Convert to float type:</span> |
| |
| <span class="sd"> >>> df.astype('float')</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1.0 1.0</span> |
| <span class="sd"> 1 2.0 2.0</span> |
| <span class="sd"> 2 3.0 3.0</span> |
| |
| <span class="sd"> Convert back to int64 type:</span> |
| |
| <span class="sd"> >>> df.astype('int64')</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 2 2</span> |
| <span class="sd"> 2 3 3</span> |
| |
| <span class="sd"> Convert column a to float type:</span> |
| |
| <span class="sd"> >>> df.astype({'a': float})</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1.0 1</span> |
| <span class="sd"> 1 2.0 2</span> |
| <span class="sd"> 2 3.0 3</span> |
| |
| <span class="sd"> """</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">if</span> <span class="n">is_dict_like</span><span class="p">(</span><span class="n">dtype</span><span class="p">):</span> |
| <span class="n">dtype_dict</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]],</span> <span class="n">dtype</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">col_name</span> <span class="ow">in</span> <span class="n">dtype_dict</span><span class="o">.</span><span class="n">keys</span><span class="p">():</span> |
| <span class="k">if</span> <span class="n">col_name</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"Only a column name can be used for the "</span> |
| <span class="s2">"key in a dtype mappings argument."</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">col_name</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="k">if</span> <span class="n">col_name</span> <span class="ow">in</span> <span class="n">dtype_dict</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">dtype_dict</span><span class="p">[</span><span class="n">col_name</span><span class="p">]))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">col_name</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.add_prefix"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.add_prefix.html#pyspark.pandas.DataFrame.add_prefix">[docs]</a> <span class="k">def</span> <span class="nf">add_prefix</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">prefix</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Prefix labels with string `prefix`.</span> |
| |
| <span class="sd"> For Series, the row labels are prefixed.</span> |
| <span class="sd"> For DataFrame, the column labels are prefixed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> prefix : str</span> |
| <span class="sd"> The string to add before each label.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> New DataFrame with updated labels.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.add_prefix: Prefix row labels with string `prefix`.</span> |
| <span class="sd"> Series.add_suffix: Suffix row labels with string `suffix`.</span> |
| <span class="sd"> DataFrame.add_suffix: Suffix column labels with string `suffix`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}, columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 3</span> |
| <span class="sd"> 1 2 4</span> |
| <span class="sd"> 2 3 5</span> |
| <span class="sd"> 3 4 6</span> |
| |
| <span class="sd"> >>> df.add_prefix('col_')</span> |
| <span class="sd"> col_A col_B</span> |
| <span class="sd"> 0 1 3</span> |
| <span class="sd"> 1 2 4</span> |
| <span class="sd"> 2 3 5</span> |
| <span class="sd"> 3 4 6</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prefix</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="nb">tuple</span><span class="p">([</span><span class="n">prefix</span> <span class="o">+</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">]))</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.add_suffix"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.add_suffix.html#pyspark.pandas.DataFrame.add_suffix">[docs]</a> <span class="k">def</span> <span class="nf">add_suffix</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">suffix</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Suffix labels with string `suffix`.</span> |
| |
| <span class="sd"> For Series, the row labels are suffixed.</span> |
| <span class="sd"> For DataFrame, the column labels are suffixed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> suffix : str</span> |
| <span class="sd"> The string to add after each label.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> New DataFrame with updated labels.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.add_prefix: Prefix row labels with string `prefix`.</span> |
| <span class="sd"> Series.add_suffix: Suffix row labels with string `suffix`.</span> |
| <span class="sd"> DataFrame.add_prefix: Prefix column labels with string `prefix`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}, columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 3</span> |
| <span class="sd"> 1 2 4</span> |
| <span class="sd"> 2 3 5</span> |
| <span class="sd"> 3 4 6</span> |
| |
| <span class="sd"> >>> df.add_suffix('_col')</span> |
| <span class="sd"> A_col B_col</span> |
| <span class="sd"> 0 1 3</span> |
| <span class="sd"> 1 2 4</span> |
| <span class="sd"> 2 3 5</span> |
| <span class="sd"> 3 4 6</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">suffix</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="nb">tuple</span><span class="p">([</span><span class="n">i</span> <span class="o">+</span> <span class="n">suffix</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">]))</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: include, and exclude should be implemented.</span> |
| <div class="viewcode-block" id="DataFrame.describe"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.describe.html#pyspark.pandas.DataFrame.describe">[docs]</a> <span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">percentiles</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Generate descriptive statistics that summarize the central tendency,</span> |
| <span class="sd"> dispersion and shape of a dataset's distribution, excluding</span> |
| <span class="sd"> ``NaN`` values.</span> |
| |
| <span class="sd"> Analyzes both numeric and object series, as well</span> |
| <span class="sd"> as ``DataFrame`` column sets of mixed data types. The output</span> |
| <span class="sd"> will vary depending on what is provided. Refer to the notes</span> |
| <span class="sd"> below for more detail.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> percentiles : list of ``float`` in range [0.0, 1.0], default [0.25, 0.5, 0.75]</span> |
| <span class="sd"> A list of percentiles to be computed.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Summary statistics of the Dataframe provided.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.count: Count number of non-NA/null observations.</span> |
| <span class="sd"> DataFrame.max: Maximum of the values in the object.</span> |
| <span class="sd"> DataFrame.min: Minimum of the values in the object.</span> |
| <span class="sd"> DataFrame.mean: Mean of the values.</span> |
| <span class="sd"> DataFrame.std: Standard deviation of the observations.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> For numeric data, the result's index will include ``count``,</span> |
| <span class="sd"> ``mean``, ``std``, ``min``, ``25%``, ``50%``, ``75%``, ``max``.</span> |
| |
| <span class="sd"> Currently only numeric data is supported.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Describing a numeric ``Series``.</span> |
| |
| <span class="sd"> >>> s = ps.Series([1, 2, 3])</span> |
| <span class="sd"> >>> s.describe()</span> |
| <span class="sd"> count 3.0</span> |
| <span class="sd"> mean 2.0</span> |
| <span class="sd"> std 1.0</span> |
| <span class="sd"> min 1.0</span> |
| <span class="sd"> 25% 1.0</span> |
| <span class="sd"> 50% 2.0</span> |
| <span class="sd"> 75% 3.0</span> |
| <span class="sd"> max 3.0</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> Describing a ``DataFrame``. Only numeric fields are returned.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'numeric1': [1, 2, 3],</span> |
| <span class="sd"> ... 'numeric2': [4.0, 5.0, 6.0],</span> |
| <span class="sd"> ... 'object': ['a', 'b', 'c']</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['numeric1', 'numeric2', 'object'])</span> |
| <span class="sd"> >>> df.describe()</span> |
| <span class="sd"> numeric1 numeric2</span> |
| <span class="sd"> count 3.0 3.0</span> |
| <span class="sd"> mean 2.0 5.0</span> |
| <span class="sd"> std 1.0 1.0</span> |
| <span class="sd"> min 1.0 4.0</span> |
| <span class="sd"> 25% 1.0 4.0</span> |
| <span class="sd"> 50% 2.0 5.0</span> |
| <span class="sd"> 75% 3.0 6.0</span> |
| <span class="sd"> max 3.0 6.0</span> |
| |
| <span class="sd"> For multi-index columns:</span> |
| |
| <span class="sd"> >>> df.columns = [('num', 'a'), ('num', 'b'), ('obj', 'c')]</span> |
| <span class="sd"> >>> df.describe() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> count 3.0 3.0</span> |
| <span class="sd"> mean 2.0 5.0</span> |
| <span class="sd"> std 1.0 1.0</span> |
| <span class="sd"> min 1.0 4.0</span> |
| <span class="sd"> 25% 1.0 4.0</span> |
| <span class="sd"> 50% 2.0 5.0</span> |
| <span class="sd"> 75% 3.0 6.0</span> |
| <span class="sd"> max 3.0 6.0</span> |
| |
| <span class="sd"> >>> df[('num', 'b')].describe()</span> |
| <span class="sd"> count 3.0</span> |
| <span class="sd"> mean 5.0</span> |
| <span class="sd"> std 1.0</span> |
| <span class="sd"> min 4.0</span> |
| <span class="sd"> 25% 4.0</span> |
| <span class="sd"> 50% 5.0</span> |
| <span class="sd"> 75% 6.0</span> |
| <span class="sd"> max 6.0</span> |
| <span class="sd"> Name: (num, b), dtype: float64</span> |
| |
| <span class="sd"> Describing a ``DataFrame`` and selecting custom percentiles.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'numeric1': [1, 2, 3],</span> |
| <span class="sd"> ... 'numeric2': [4.0, 5.0, 6.0]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['numeric1', 'numeric2'])</span> |
| <span class="sd"> >>> df.describe(percentiles = [0.85, 0.15])</span> |
| <span class="sd"> numeric1 numeric2</span> |
| <span class="sd"> count 3.0 3.0</span> |
| <span class="sd"> mean 2.0 5.0</span> |
| <span class="sd"> std 1.0 1.0</span> |
| <span class="sd"> min 1.0 4.0</span> |
| <span class="sd"> 15% 1.0 4.0</span> |
| <span class="sd"> 50% 2.0 5.0</span> |
| <span class="sd"> 85% 3.0 6.0</span> |
| <span class="sd"> max 3.0 6.0</span> |
| |
| <span class="sd"> Describing a column from a ``DataFrame`` by accessing it as</span> |
| <span class="sd"> an attribute.</span> |
| |
| <span class="sd"> >>> df.numeric1.describe()</span> |
| <span class="sd"> count 3.0</span> |
| <span class="sd"> mean 2.0</span> |
| <span class="sd"> std 1.0</span> |
| <span class="sd"> min 1.0</span> |
| <span class="sd"> 25% 1.0</span> |
| <span class="sd"> 50% 2.0</span> |
| <span class="sd"> 75% 3.0</span> |
| <span class="sd"> max 3.0</span> |
| <span class="sd"> Name: numeric1, dtype: float64</span> |
| |
| <span class="sd"> Describing a column from a ``DataFrame`` by accessing it as</span> |
| <span class="sd"> an attribute and selecting custom percentiles.</span> |
| |
| <span class="sd"> >>> df.numeric1.describe(percentiles = [0.85, 0.15])</span> |
| <span class="sd"> count 3.0</span> |
| <span class="sd"> mean 2.0</span> |
| <span class="sd"> std 1.0</span> |
| <span class="sd"> min 1.0</span> |
| <span class="sd"> 15% 1.0</span> |
| <span class="sd"> 50% 2.0</span> |
| <span class="sd"> 85% 3.0</span> |
| <span class="sd"> max 3.0</span> |
| <span class="sd"> Name: numeric1, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="n">exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">exprs</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Cannot describe a DataFrame without columns"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">percentiles</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">((</span><span class="n">p</span> <span class="o"><</span> <span class="mf">0.0</span><span class="p">)</span> <span class="ow">or</span> <span class="p">(</span><span class="n">p</span> <span class="o">></span> <span class="mf">1.0</span><span class="p">)</span> <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">percentiles</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Percentiles should all be in the interval [0, 1]"</span><span class="p">)</span> |
| <span class="c1"># appending 50% if not in percentiles already</span> |
| <span class="n">percentiles</span> <span class="o">=</span> <span class="p">(</span><span class="n">percentiles</span> <span class="o">+</span> <span class="p">[</span><span class="mf">0.5</span><span class="p">])</span> <span class="k">if</span> <span class="mf">0.5</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">percentiles</span> <span class="k">else</span> <span class="n">percentiles</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">percentiles</span> <span class="o">=</span> <span class="p">[</span><span class="mf">0.25</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">,</span> <span class="mf">0.75</span><span class="p">]</span> |
| |
| <span class="n">formatted_perc</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"</span><span class="si">{:.0%}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">p</span><span class="p">)</span> <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">percentiles</span><span class="p">)]</span> |
| <span class="n">stats</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"count"</span><span class="p">,</span> <span class="s2">"mean"</span><span class="p">,</span> <span class="s2">"stddev"</span><span class="p">,</span> <span class="s2">"min"</span><span class="p">,</span> <span class="o">*</span><span class="n">formatted_perc</span><span class="p">,</span> <span class="s2">"max"</span><span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">exprs</span><span class="p">)</span><span class="o">.</span><span class="n">summary</span><span class="p">(</span><span class="o">*</span><span class="n">stats</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">"stddev"</span><span class="p">,</span> <span class="s2">"std"</span><span class="p">,</span> <span class="n">subset</span><span class="o">=</span><span class="p">[</span><span class="s2">"summary"</span><span class="p">])</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"summary"</span><span class="p">)],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s2">"float64"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.drop_duplicates"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.drop_duplicates.html#pyspark.pandas.DataFrame.drop_duplicates">[docs]</a> <span class="k">def</span> <span class="nf">drop_duplicates</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">keep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"first"</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return DataFrame with duplicate rows removed, optionally only</span> |
| <span class="sd"> considering certain columns.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> subset : column label or sequence of labels, optional</span> |
| <span class="sd"> Only consider certain columns for identifying duplicates, by</span> |
| <span class="sd"> default use all of the columns.</span> |
| <span class="sd"> keep : {'first', 'last', False}, default 'first'</span> |
| <span class="sd"> Determines which duplicates (if any) to keep.</span> |
| <span class="sd"> - ``first`` : Drop duplicates except for the first occurrence.</span> |
| <span class="sd"> - ``last`` : Drop duplicates except for the last occurrence.</span> |
| <span class="sd"> - False : Drop all duplicates.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Whether to drop duplicates in place or to return a copy.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with duplicates removed or None if ``inplace=True``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... {'a': [1, 2, 2, 2, 3], 'b': ['a', 'a', 'a', 'c', 'd']}, columns = ['a', 'b'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 a</span> |
| <span class="sd"> 1 2 a</span> |
| <span class="sd"> 2 2 a</span> |
| <span class="sd"> 3 2 c</span> |
| <span class="sd"> 4 3 d</span> |
| |
| <span class="sd"> >>> df.drop_duplicates().sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 a</span> |
| <span class="sd"> 1 2 a</span> |
| <span class="sd"> 3 2 c</span> |
| <span class="sd"> 4 3 d</span> |
| |
| <span class="sd"> >>> df.drop_duplicates('a').sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 a</span> |
| <span class="sd"> 1 2 a</span> |
| <span class="sd"> 4 3 d</span> |
| |
| <span class="sd"> >>> df.drop_duplicates(['a', 'b']).sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 a</span> |
| <span class="sd"> 1 2 a</span> |
| <span class="sd"> 3 2 c</span> |
| <span class="sd"> 4 3 d</span> |
| |
| <span class="sd"> >>> df.drop_duplicates(keep='last').sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 a</span> |
| <span class="sd"> 2 2 a</span> |
| <span class="sd"> 3 2 c</span> |
| <span class="sd"> 4 3 d</span> |
| |
| <span class="sd"> >>> df.drop_duplicates(keep=False).sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 a</span> |
| <span class="sd"> 3 2 c</span> |
| <span class="sd"> 4 3 d</span> |
| <span class="sd"> """</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| |
| <span class="n">sdf</span><span class="p">,</span> <span class="n">column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_mark_duplicates</span><span class="p">(</span><span class="n">subset</span><span class="p">,</span> <span class="n">keep</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="o">~</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">column</span><span class="p">))</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">column</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.reindex"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.reindex.html#pyspark.pandas.DataFrame.reindex">[docs]</a> <span class="k">def</span> <span class="nf">reindex</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">labels</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"Index"</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">copy</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Conform DataFrame to new index with optional filling logic, placing</span> |
| <span class="sd"> NA/NaN in locations having no value in the previous index. A new object</span> |
| <span class="sd"> is produced unless the new index is equivalent to the current one and</span> |
| <span class="sd"> ``copy=False``.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> labels : array-like, optional</span> |
| <span class="sd"> New labels / index to conform the axis specified by ``axis`` to.</span> |
| <span class="sd"> index, columns : array-like, optional</span> |
| <span class="sd"> New labels / index to conform to; should be specified using keywords.</span> |
| <span class="sd"> Preferably an Index object to avoid duplicating data.</span> |
| <span class="sd"> axis : int or str, optional</span> |
| <span class="sd"> Axis to target. Can be either the axis name ('index', 'columns') or</span> |
| <span class="sd"> number (0, 1).</span> |
| <span class="sd"> copy : bool, default True</span> |
| <span class="sd"> Return a new object, even if the passed indexes are the same.</span> |
| <span class="sd"> fill_value : scalar, default np.NaN</span> |
| <span class="sd"> Value to use for missing values. Defaults to NaN, but can be any</span> |
| <span class="sd"> "compatible" value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame with changed index.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.set_index : Set row labels.</span> |
| <span class="sd"> DataFrame.reset_index : Remove row labels or move them to new columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> ``DataFrame.reindex`` supports two calling conventions</span> |
| |
| <span class="sd"> * ``(index=index_labels, columns=column_labels, ...)``</span> |
| <span class="sd"> * ``(labels, axis={'index', 'columns'}, ...)``</span> |
| |
| <span class="sd"> We *highly* recommend using keyword arguments to clarify your</span> |
| <span class="sd"> intent.</span> |
| |
| <span class="sd"> Create a dataframe with some fictional data.</span> |
| |
| <span class="sd"> >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'http_status': [200, 200, 404, 404, 301],</span> |
| <span class="sd"> ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},</span> |
| <span class="sd"> ... index=index,</span> |
| <span class="sd"> ... columns=['http_status', 'response_time'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> http_status response_time</span> |
| <span class="sd"> Firefox 200 0.04</span> |
| <span class="sd"> Chrome 200 0.02</span> |
| <span class="sd"> Safari 404 0.07</span> |
| <span class="sd"> IE10 404 0.08</span> |
| <span class="sd"> Konqueror 301 1.00</span> |
| |
| <span class="sd"> Create a new index and reindex the dataframe. By default</span> |
| <span class="sd"> values in the new index that do not have corresponding</span> |
| <span class="sd"> records in the dataframe are assigned ``NaN``.</span> |
| |
| <span class="sd"> >>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',</span> |
| <span class="sd"> ... 'Chrome']</span> |
| <span class="sd"> >>> df.reindex(new_index).sort_index()</span> |
| <span class="sd"> http_status response_time</span> |
| <span class="sd"> Chrome 200.0 0.02</span> |
| <span class="sd"> Comodo Dragon NaN NaN</span> |
| <span class="sd"> IE10 404.0 0.08</span> |
| <span class="sd"> Iceweasel NaN NaN</span> |
| <span class="sd"> Safari 404.0 0.07</span> |
| |
| <span class="sd"> We can fill in the missing values by passing a value to</span> |
| <span class="sd"> the keyword ``fill_value``.</span> |
| |
| <span class="sd"> >>> df.reindex(new_index, fill_value=0, copy=False).sort_index()</span> |
| <span class="sd"> http_status response_time</span> |
| <span class="sd"> Chrome 200 0.02</span> |
| <span class="sd"> Comodo Dragon 0 0.00</span> |
| <span class="sd"> IE10 404 0.08</span> |
| <span class="sd"> Iceweasel 0 0.00</span> |
| <span class="sd"> Safari 404 0.07</span> |
| |
| <span class="sd"> We can also reindex the columns.</span> |
| |
| <span class="sd"> >>> df.reindex(columns=['http_status', 'user_agent']).sort_index()</span> |
| <span class="sd"> http_status user_agent</span> |
| <span class="sd"> Chrome 200 NaN</span> |
| <span class="sd"> Firefox 200 NaN</span> |
| <span class="sd"> IE10 404 NaN</span> |
| <span class="sd"> Konqueror 301 NaN</span> |
| <span class="sd"> Safari 404 NaN</span> |
| |
| <span class="sd"> Or we can use "axis-style" keyword arguments</span> |
| |
| <span class="sd"> >>> df.reindex(['http_status', 'user_agent'], axis="columns").sort_index()</span> |
| <span class="sd"> http_status user_agent</span> |
| <span class="sd"> Chrome 200 NaN</span> |
| <span class="sd"> Firefox 200 NaN</span> |
| <span class="sd"> IE10 404 NaN</span> |
| <span class="sd"> Konqueror 301 NaN</span> |
| <span class="sd"> Safari 404 NaN</span> |
| |
| <span class="sd"> To further illustrate the filling functionality in</span> |
| <span class="sd"> ``reindex``, we will create a dataframe with a</span> |
| <span class="sd"> monotonically increasing index (for example, a sequence</span> |
| <span class="sd"> of dates).</span> |
| |
| <span class="sd"> >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')</span> |
| <span class="sd"> >>> df2 = ps.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]},</span> |
| <span class="sd"> ... index=date_index)</span> |
| <span class="sd"> >>> df2.sort_index()</span> |
| <span class="sd"> prices</span> |
| <span class="sd"> 2010-01-01 100.0</span> |
| <span class="sd"> 2010-01-02 101.0</span> |
| <span class="sd"> 2010-01-03 NaN</span> |
| <span class="sd"> 2010-01-04 100.0</span> |
| <span class="sd"> 2010-01-05 89.0</span> |
| <span class="sd"> 2010-01-06 88.0</span> |
| |
| <span class="sd"> Suppose we decide to expand the dataframe to cover a wider</span> |
| <span class="sd"> date range.</span> |
| |
| <span class="sd"> >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')</span> |
| <span class="sd"> >>> df2.reindex(date_index2).sort_index()</span> |
| <span class="sd"> prices</span> |
| <span class="sd"> 2009-12-29 NaN</span> |
| <span class="sd"> 2009-12-30 NaN</span> |
| <span class="sd"> 2009-12-31 NaN</span> |
| <span class="sd"> 2010-01-01 100.0</span> |
| <span class="sd"> 2010-01-02 101.0</span> |
| <span class="sd"> 2010-01-03 NaN</span> |
| <span class="sd"> 2010-01-04 100.0</span> |
| <span class="sd"> 2010-01-05 89.0</span> |
| <span class="sd"> 2010-01-06 88.0</span> |
| <span class="sd"> 2010-01-07 NaN</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="p">(</span><span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Cannot specify both 'axis' and any of 'index' or 'columns'."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">labels</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="n">labels</span> |
| <span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="n">labels</span> |
| |
| <span class="k">if</span> <span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">index</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Index must be called with a collection of some kind, "</span> |
| <span class="s2">"</span><span class="si">%s</span><span class="s2"> was passed"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">index</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Columns must be called with a collection of some kind, "</span> |
| <span class="s2">"</span><span class="si">%s</span><span class="s2"> was passed"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">df</span> <span class="o">=</span> <span class="bp">self</span> |
| |
| <span class="k">if</span> <span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">_reindex_index</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">_reindex_columns</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">)</span> |
| |
| <span class="c1"># Copy</span> |
| <span class="k">if</span> <span class="n">copy</span> <span class="ow">and</span> <span class="n">df</span> <span class="ow">is</span> <span class="bp">self</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">df</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">df</span></div> |
| |
| <span class="k">def</span> <span class="nf">_reindex_index</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"Index"</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]]],</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="c1"># When axis is index, we can mimic pandas' behavior with a right outer join.</span> |
| <span class="n">nlevels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> |
| <span class="k">assert</span> <span class="n">nlevels</span> <span class="o"><=</span> <span class="mi">1</span> <span class="ow">or</span> <span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">)</span> <span class="ow">and</span> <span class="n">nlevels</span> <span class="o">==</span> <span class="n">index</span><span class="o">.</span><span class="n">nlevels</span> |
| <span class="p">),</span> <span class="s2">"MultiIndex DataFrame can only be reindexed with a similar pandas-on-Spark MultiIndex."</span> |
| |
| <span class="n">index_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">frame</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Index</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">nlevels</span> <span class="o">!=</span> <span class="n">index</span><span class="o">.</span><span class="n">nlevels</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">index</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">([]))</span><span class="o">.</span><span class="n">reindex</span><span class="p">(</span> |
| <span class="n">columns</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">fill_value</span><span class="o">=</span><span class="n">fill_value</span> |
| <span class="p">)</span> |
| |
| <span class="n">index_names</span> <span class="o">=</span> <span class="n">index</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="n">scols</span> <span class="o">=</span> <span class="n">index</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="n">index</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_column</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">index_column</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">scols</span><span class="p">,</span> <span class="n">index_columns</span><span class="p">)]</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="n">ps</span><span class="o">.</span><span class="n">Index</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">index</span><span class="p">))</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="n">index</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">index</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| |
| <span class="k">if</span> <span class="n">fill_value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">frame_index_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">frame</span><span class="p">,</span> <span class="s2">"__frame_index_column_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">nlevels</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">index_scols</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">frame</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">frame_index_col</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index_col</span><span class="p">,</span> <span class="n">frame_index_col</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">index_columns</span><span class="p">,</span> <span class="n">frame_index_columns</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">scols</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">data_spark_columns</span> |
| <span class="n">frame</span> <span class="o">=</span> <span class="n">frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">index_scols</span> <span class="o">+</span> <span class="n">scols</span><span class="p">)</span> |
| |
| <span class="n">temp_fill_value</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">frame</span><span class="p">,</span> <span class="s2">"__fill_value__"</span><span class="p">)</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="n">labels</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_fill_value</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">fill_value</span><span class="p">))</span> |
| |
| <span class="n">frame_index_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">frame</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">frame_index_columns</span><span class="p">]</span> |
| <span class="n">labels_index_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">labels</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_columns</span><span class="p">]</span> |
| |
| <span class="n">joined_df</span> <span class="o">=</span> <span class="n">frame</span><span class="o">.</span><span class="n">join</span><span class="p">(</span> |
| <span class="n">labels</span><span class="p">,</span> |
| <span class="n">on</span><span class="o">=</span><span class="p">[</span><span class="n">fcol</span> <span class="o">==</span> <span class="n">lcol</span> <span class="k">for</span> <span class="n">fcol</span><span class="p">,</span> <span class="n">lcol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">frame_index_scols</span><span class="p">,</span> <span class="n">labels_index_scols</span><span class="p">)],</span> |
| <span class="n">how</span><span class="o">=</span><span class="s2">"right"</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">joined_df</span> <span class="o">=</span> <span class="n">joined_df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="n">labels_index_scols</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">reduce</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">c1</span><span class="p">,</span> <span class="n">c2</span><span class="p">:</span> <span class="n">c1</span> <span class="o">&</span> <span class="n">c2</span><span class="p">,</span> |
| <span class="p">[</span> |
| <span class="n">fcol</span><span class="o">.</span><span class="n">isNull</span><span class="p">()</span> <span class="o">&</span> <span class="n">lcol</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">fcol</span><span class="p">,</span> <span class="n">lcol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">frame_index_scols</span><span class="p">,</span> <span class="n">labels_index_scols</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="p">),</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">joined_df</span><span class="p">,</span> <span class="n">temp_fill_value</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">joined_df</span><span class="p">,</span> <span class="n">col</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">joined_df</span> <span class="o">=</span> <span class="n">frame</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">labels</span><span class="p">,</span> <span class="n">on</span><span class="o">=</span><span class="n">index_columns</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">"right"</span><span class="p">)</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span><span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">joined_df</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">field</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span> |
| <span class="n">index</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_reindex_columns</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]]],</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="n">level</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">label_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">label_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Expected tuple, got </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">label_columns</span> <span class="o">=</span> <span class="p">[(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">label_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="o">!=</span> <span class="n">level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"shape (1,</span><span class="si">{}</span><span class="s2">) doesn't match the shape (1,</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">level</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">fill_value</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span> <span class="k">if</span> <span class="n">fill_value</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">fill_value</span> |
| <span class="n">scols_or_pssers</span> <span class="o">=</span> <span class="p">[]</span> <span class="c1"># type: List[Union[Series, Column]]</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">label_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">scols_or_pssers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">scols_or_pssers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">fill_value</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span> |
| <span class="n">labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">):</span> |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">name</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">name</span><span class="p">,)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">columns</span><span class="o">.</span><span class="n">names</span> |
| <span class="p">]</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="n">scols_or_pssers</span><span class="p">,</span> <span class="n">column_labels</span><span class="o">=</span><span class="n">labels</span><span class="p">,</span> <span class="n">column_label_names</span><span class="o">=</span><span class="n">column_label_names</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">scols_or_pssers</span><span class="p">,</span> <span class="n">column_labels</span><span class="o">=</span><span class="n">labels</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="DataFrame.reindex_like"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.reindex_like.html#pyspark.pandas.DataFrame.reindex_like">[docs]</a> <span class="k">def</span> <span class="nf">reindex_like</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">,</span> <span class="n">copy</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a DataFrame with indices matching those of the other object.</span> |
| |
| <span class="sd"> Conform the object to the same index on all axes. Places NA/NaN in locations</span> |
| <span class="sd"> having no value in the previous index. A new object is produced unless the</span> |
| <span class="sd"> new index is equivalent to the current one and copy=False.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : DataFrame</span> |
| <span class="sd"> Its row and column indices are used to define the new indices</span> |
| <span class="sd"> of this object.</span> |
| <span class="sd"> copy : bool, default True</span> |
| <span class="sd"> Return a new object, even if the passed indexes are the same.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with changed indices on each axis.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.set_index : Set row labels.</span> |
| <span class="sd"> DataFrame.reset_index : Remove row labels or move them to new columns.</span> |
| <span class="sd"> DataFrame.reindex : Change to new indices or expand indices.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Same as calling</span> |
| <span class="sd"> ``.reindex(index=other.index, columns=other.columns,...)``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df1 = ps.DataFrame([[24.3, 75.7, 'high'],</span> |
| <span class="sd"> ... [31, 87.8, 'high'],</span> |
| <span class="sd"> ... [22, 71.6, 'medium'],</span> |
| <span class="sd"> ... [35, 95, 'medium']],</span> |
| <span class="sd"> ... columns=['temp_celsius', 'temp_fahrenheit',</span> |
| <span class="sd"> ... 'windspeed'],</span> |
| <span class="sd"> ... index=pd.date_range(start='2014-02-12',</span> |
| <span class="sd"> ... end='2014-02-15', freq='D'))</span> |
| <span class="sd"> >>> df1</span> |
| <span class="sd"> temp_celsius temp_fahrenheit windspeed</span> |
| <span class="sd"> 2014-02-12 24.3 75.7 high</span> |
| <span class="sd"> 2014-02-13 31.0 87.8 high</span> |
| <span class="sd"> 2014-02-14 22.0 71.6 medium</span> |
| <span class="sd"> 2014-02-15 35.0 95.0 medium</span> |
| |
| <span class="sd"> >>> df2 = ps.DataFrame([[28, 'low'],</span> |
| <span class="sd"> ... [30, 'low'],</span> |
| <span class="sd"> ... [35.1, 'medium']],</span> |
| <span class="sd"> ... columns=['temp_celsius', 'windspeed'],</span> |
| <span class="sd"> ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',</span> |
| <span class="sd"> ... '2014-02-15']))</span> |
| <span class="sd"> >>> df2</span> |
| <span class="sd"> temp_celsius windspeed</span> |
| <span class="sd"> 2014-02-12 28.0 low</span> |
| <span class="sd"> 2014-02-13 30.0 low</span> |
| <span class="sd"> 2014-02-15 35.1 medium</span> |
| |
| <span class="sd"> >>> df2.reindex_like(df1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> temp_celsius temp_fahrenheit windspeed</span> |
| <span class="sd"> 2014-02-12 28.0 NaN low</span> |
| <span class="sd"> 2014-02-13 30.0 NaN low</span> |
| <span class="sd"> 2014-02-14 NaN NaN None</span> |
| <span class="sd"> 2014-02-15 35.1 NaN medium</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">reindex</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">other</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">other</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="n">copy</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"other must be a pandas-on-Spark DataFrame"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.melt"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.melt.html#pyspark.pandas.DataFrame.melt">[docs]</a> <span class="k">def</span> <span class="nf">melt</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">id_vars</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">value_vars</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">var_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">value_name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"value"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Unpivot a DataFrame from wide format to long format, optionally</span> |
| <span class="sd"> leaving identifier variables set.</span> |
| |
| <span class="sd"> This function is useful to massage a DataFrame into a format where one</span> |
| <span class="sd"> or more columns are identifier variables (`id_vars`), while all other</span> |
| <span class="sd"> columns, considered measured variables (`value_vars`), are "unpivoted" to</span> |
| <span class="sd"> the row axis, leaving just two non-identifier columns, 'variable' and</span> |
| <span class="sd"> 'value'.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> frame : DataFrame</span> |
| <span class="sd"> id_vars : tuple, list, or ndarray, optional</span> |
| <span class="sd"> Column(s) to use as identifier variables.</span> |
| <span class="sd"> value_vars : tuple, list, or ndarray, optional</span> |
| <span class="sd"> Column(s) to unpivot. If not specified, uses all columns that</span> |
| <span class="sd"> are not set as `id_vars`.</span> |
| <span class="sd"> var_name : scalar, default 'variable'</span> |
| <span class="sd"> Name to use for the 'variable' column. If None, it uses `frame.columns.name` or</span> |
| <span class="sd"> 'variable'.</span> |
| <span class="sd"> value_name : scalar, default 'value'</span> |
| <span class="sd"> Name to use for the 'value' column.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Unpivoted DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},</span> |
| <span class="sd"> ... 'B': {0: 1, 1: 3, 2: 5},</span> |
| <span class="sd"> ... 'C': {0: 2, 1: 4, 2: 6}},</span> |
| <span class="sd"> ... columns=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 a 1 2</span> |
| <span class="sd"> 1 b 3 4</span> |
| <span class="sd"> 2 c 5 6</span> |
| |
| <span class="sd"> >>> ps.melt(df)</span> |
| <span class="sd"> variable value</span> |
| <span class="sd"> 0 A a</span> |
| <span class="sd"> 1 B 1</span> |
| <span class="sd"> 2 C 2</span> |
| <span class="sd"> 3 A b</span> |
| <span class="sd"> 4 B 3</span> |
| <span class="sd"> 5 C 4</span> |
| <span class="sd"> 6 A c</span> |
| <span class="sd"> 7 B 5</span> |
| <span class="sd"> 8 C 6</span> |
| |
| <span class="sd"> >>> df.melt(id_vars='A')</span> |
| <span class="sd"> A variable value</span> |
| <span class="sd"> 0 a B 1</span> |
| <span class="sd"> 1 a C 2</span> |
| <span class="sd"> 2 b B 3</span> |
| <span class="sd"> 3 b C 4</span> |
| <span class="sd"> 4 c B 5</span> |
| <span class="sd"> 5 c C 6</span> |
| |
| <span class="sd"> >>> df.melt(value_vars='A')</span> |
| <span class="sd"> variable value</span> |
| <span class="sd"> 0 A a</span> |
| <span class="sd"> 1 A b</span> |
| <span class="sd"> 2 A c</span> |
| |
| <span class="sd"> >>> ps.melt(df, id_vars=['A', 'B'])</span> |
| <span class="sd"> A B variable value</span> |
| <span class="sd"> 0 a 1 C 2</span> |
| <span class="sd"> 1 b 3 C 4</span> |
| <span class="sd"> 2 c 5 C 6</span> |
| |
| <span class="sd"> >>> df.melt(id_vars=['A'], value_vars=['C'])</span> |
| <span class="sd"> A variable value</span> |
| <span class="sd"> 0 a C 2</span> |
| <span class="sd"> 1 b C 4</span> |
| <span class="sd"> 2 c C 6</span> |
| |
| <span class="sd"> The names of 'variable' and 'value' columns can be customized:</span> |
| |
| <span class="sd"> >>> ps.melt(df, id_vars=['A'], value_vars=['B'],</span> |
| <span class="sd"> ... var_name='myVarname', value_name='myValname')</span> |
| <span class="sd"> A myVarname myValname</span> |
| <span class="sd"> 0 a B 1</span> |
| <span class="sd"> 1 b B 3</span> |
| <span class="sd"> 2 c B 5</span> |
| <span class="sd"> """</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| |
| <span class="k">if</span> <span class="n">id_vars</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">id_vars</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">id_vars</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">id_vars</span> <span class="o">=</span> <span class="p">[</span><span class="n">idv</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">idv</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">idv</span><span class="p">,)</span> <span class="k">for</span> <span class="n">idv</span> <span class="ow">in</span> <span class="n">id_vars</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"id_vars must be a list of tuples"</span> <span class="s2">" when columns are a MultiIndex"</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">id_vars</span><span class="p">):</span> |
| <span class="n">id_vars</span> <span class="o">=</span> <span class="p">[(</span><span class="n">id_vars</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">id_vars</span> <span class="o">=</span> <span class="p">[</span><span class="n">idv</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">idv</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">idv</span><span class="p">,)</span> <span class="k">for</span> <span class="n">idv</span> <span class="ow">in</span> <span class="n">id_vars</span><span class="p">]</span> |
| |
| <span class="n">non_existence_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">idv</span> <span class="k">for</span> <span class="n">idv</span> <span class="ow">in</span> <span class="n">id_vars</span> <span class="k">if</span> <span class="n">idv</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">non_existence_col</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">raveled_column_labels</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">ravel</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="n">missing</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">nec</span> <span class="k">for</span> <span class="n">nec</span> <span class="ow">in</span> <span class="n">np</span><span class="o">.</span><span class="n">ravel</span><span class="p">(</span><span class="n">non_existence_col</span><span class="p">)</span> <span class="k">if</span> <span class="n">nec</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">raveled_column_labels</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">missing</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"The following 'id_vars' are not present"</span> |
| <span class="s2">" in the DataFrame: </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">missing</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"None of </span><span class="si">{}</span><span class="s2"> are in the </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">non_existence_col</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">value_vars</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">value_vars</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value_vars</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">value_vars</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">valv</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">valv</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">valv</span><span class="p">,)</span> <span class="k">for</span> <span class="n">valv</span> <span class="ow">in</span> <span class="n">value_vars</span> |
| <span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"value_vars must be a list of tuples"</span> <span class="s2">" when columns are a MultiIndex"</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">value_vars</span><span class="p">):</span> |
| <span class="n">value_vars</span> <span class="o">=</span> <span class="p">[(</span><span class="n">value_vars</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">value_vars</span> <span class="o">=</span> <span class="p">[</span><span class="n">valv</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">valv</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">valv</span><span class="p">,)</span> <span class="k">for</span> <span class="n">valv</span> <span class="ow">in</span> <span class="n">value_vars</span><span class="p">]</span> |
| |
| <span class="n">non_existence_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">valv</span> <span class="k">for</span> <span class="n">valv</span> <span class="ow">in</span> <span class="n">value_vars</span> <span class="k">if</span> <span class="n">valv</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">non_existence_col</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">raveled_column_labels</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">ravel</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> |
| <span class="n">missing</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">nec</span> <span class="k">for</span> <span class="n">nec</span> <span class="ow">in</span> <span class="n">np</span><span class="o">.</span><span class="n">ravel</span><span class="p">(</span><span class="n">non_existence_col</span><span class="p">)</span> <span class="k">if</span> <span class="n">nec</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">raveled_column_labels</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">missing</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"The following 'value_vars' are not present"</span> |
| <span class="s2">" in the DataFrame: </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">missing</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"None of </span><span class="si">{}</span><span class="s2"> are in the </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">non_existence_col</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">value_vars</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">value_vars</span> <span class="o">=</span> <span class="n">column_labels</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">id_vars</span><span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| |
| <span class="k">if</span> <span class="n">var_name</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="p">):</span> |
| <span class="n">var_name</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"variable"</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">var_name</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">name_like_string</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">if</span> <span class="n">name</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="s2">"variable_</span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">var_name</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">var_name</span> <span class="o">=</span> <span class="p">[</span><span class="n">var_name</span><span class="p">]</span> |
| |
| <span class="n">pairs</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">c</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">var_name</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">value_name</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">value_vars</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">columns</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="p">[</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">id_vars</span> |
| <span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"pairs.`</span><span class="si">%s</span><span class="s2">`"</span> <span class="o">%</span> <span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">var_name</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"pairs.`</span><span class="si">%s</span><span class="s2">`"</span> <span class="o">%</span> <span class="n">value_name</span><span class="p">)]</span> |
| <span class="p">)</span> |
| <span class="n">exploded_df</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"pairs"</span><span class="p">,</span> <span class="n">pairs</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">exploded_df</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">label</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="k">else</span> <span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">),)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">id_vars</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[(</span><span class="n">name</span><span class="p">,)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">var_name</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[(</span><span class="n">value_name</span><span class="p">,)]</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.stack"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.stack.html#pyspark.pandas.DataFrame.stack">[docs]</a> <span class="k">def</span> <span class="nf">stack</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrameOrSeries</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Stack the prescribed level(s) from columns to index.</span> |
| |
| <span class="sd"> Return a reshaped DataFrame or Series having a multi-level</span> |
| <span class="sd"> index with one or more new inner-most levels compared to the current</span> |
| <span class="sd"> DataFrame. The new inner-most levels are created by pivoting the</span> |
| <span class="sd"> columns of the current dataframe:</span> |
| |
| <span class="sd"> - if the columns have a single level, the output is a Series;</span> |
| <span class="sd"> - if the columns have multiple levels, the new index</span> |
| <span class="sd"> level(s) is (are) taken from the prescribed level(s) and</span> |
| <span class="sd"> the output is a DataFrame.</span> |
| |
| <span class="sd"> The new index levels are sorted.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| <span class="sd"> Stacked dataframe or series.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.unstack : Unstack prescribed level(s) from index axis</span> |
| <span class="sd"> onto column axis.</span> |
| <span class="sd"> DataFrame.pivot : Reshape dataframe from long format to wide</span> |
| <span class="sd"> format.</span> |
| <span class="sd"> DataFrame.pivot_table : Create a spreadsheet-style pivot table</span> |
| <span class="sd"> as a DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is named by analogy with a collection of books</span> |
| <span class="sd"> being reorganized from being side by side on a horizontal</span> |
| <span class="sd"> position (the columns of the dataframe) to being stacked</span> |
| <span class="sd"> vertically on top of each other (in the index of the</span> |
| <span class="sd"> dataframe).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> **Single level columns**</span> |
| |
| <span class="sd"> >>> df_single_level_cols = ps.DataFrame([[0, 1], [2, 3]],</span> |
| <span class="sd"> ... index=['cat', 'dog'],</span> |
| <span class="sd"> ... columns=['weight', 'height'])</span> |
| |
| <span class="sd"> Stacking a dataframe with a single level column axis returns a Series:</span> |
| |
| <span class="sd"> >>> df_single_level_cols</span> |
| <span class="sd"> weight height</span> |
| <span class="sd"> cat 0 1</span> |
| <span class="sd"> dog 2 3</span> |
| <span class="sd"> >>> df_single_level_cols.stack().sort_index()</span> |
| <span class="sd"> cat height 1</span> |
| <span class="sd"> weight 0</span> |
| <span class="sd"> dog height 3</span> |
| <span class="sd"> weight 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> **Multi level columns: simple case**</span> |
| |
| <span class="sd"> >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),</span> |
| <span class="sd"> ... ('weight', 'pounds')])</span> |
| <span class="sd"> >>> df_multi_level_cols1 = ps.DataFrame([[1, 2], [2, 4]],</span> |
| <span class="sd"> ... index=['cat', 'dog'],</span> |
| <span class="sd"> ... columns=multicol1)</span> |
| |
| <span class="sd"> Stacking a dataframe with a multi-level column axis:</span> |
| |
| <span class="sd"> >>> df_multi_level_cols1 # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> weight</span> |
| <span class="sd"> kg pounds</span> |
| <span class="sd"> cat 1 2</span> |
| <span class="sd"> dog 2 4</span> |
| <span class="sd"> >>> df_multi_level_cols1.stack().sort_index()</span> |
| <span class="sd"> weight</span> |
| <span class="sd"> cat kg 1</span> |
| <span class="sd"> pounds 2</span> |
| <span class="sd"> dog kg 2</span> |
| <span class="sd"> pounds 4</span> |
| |
| <span class="sd"> **Missing values**</span> |
| |
| <span class="sd"> >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),</span> |
| <span class="sd"> ... ('height', 'm')])</span> |
| <span class="sd"> >>> df_multi_level_cols2 = ps.DataFrame([[1.0, 2.0], [3.0, 4.0]],</span> |
| <span class="sd"> ... index=['cat', 'dog'],</span> |
| <span class="sd"> ... columns=multicol2)</span> |
| |
| <span class="sd"> It is common to have missing values when stacking a dataframe</span> |
| <span class="sd"> with multi-level columns, as the stacked dataframe typically</span> |
| <span class="sd"> has more values than the original dataframe. Missing values</span> |
| <span class="sd"> are filled with NaNs:</span> |
| |
| <span class="sd"> >>> df_multi_level_cols2</span> |
| <span class="sd"> weight height</span> |
| <span class="sd"> kg m</span> |
| <span class="sd"> cat 1.0 2.0</span> |
| <span class="sd"> dog 3.0 4.0</span> |
| <span class="sd"> >>> df_multi_level_cols2.stack().sort_index() # doctest: +SKIP</span> |
| <span class="sd"> height weight</span> |
| <span class="sd"> cat kg NaN 1.0</span> |
| <span class="sd"> m 2.0 NaN</span> |
| <span class="sd"> dog kg NaN 3.0</span> |
| <span class="sd"> m 4.0 NaN</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">dict</span><span class="p">)</span> <span class="c1"># type: Union[defaultdict, OrderedDict]</span> |
| <span class="n">index_values</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span> |
| <span class="n">should_returns_series</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">new_label</span> <span class="o">=</span> <span class="n">label</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">new_label</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">new_label</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">should_returns_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="n">label</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">column_labels</span><span class="p">[</span><span class="n">new_label</span><span class="p">][</span><span class="n">value</span><span class="p">]</span> <span class="o">=</span> <span class="n">scol</span> |
| |
| <span class="n">index_values</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">OrderedDict</span><span class="p">(</span><span class="nb">sorted</span><span class="p">(</span><span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">(),</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span> |
| |
| <span class="n">index_name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_label_names</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span><span class="p">]</span> |
| |
| <span class="n">index_column</span> <span class="o">=</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span> |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span> |
| |
| <span class="n">structs</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">value</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_column</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="p">(</span> |
| <span class="n">column_labels</span><span class="p">[</span><span class="n">label</span><span class="p">][</span><span class="n">value</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> |
| <span class="k">else</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">data_columns</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">index_values</span> |
| <span class="p">]</span> |
| |
| <span class="n">pairs</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="n">structs</span><span class="p">))</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"pairs"</span><span class="p">,</span> <span class="n">pairs</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="o">+</span> <span class="p">[</span><span class="n">sdf</span><span class="p">[</span><span class="s2">"pairs"</span><span class="p">][</span><span class="n">index_column</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_column</span><span class="p">)]</span> |
| <span class="o">+</span> <span class="p">[</span><span class="n">sdf</span><span class="p">[</span><span class="s2">"pairs"</span><span class="p">][</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">]</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">index_column</span><span class="p">])</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">index_name</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> <span class="o">+</span> <span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">column_labels</span><span class="p">),</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> <span class="c1"># type: "DataFrame"</span> |
| |
| <span class="k">if</span> <span class="n">should_returns_series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.unstack"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.unstack.html#pyspark.pandas.DataFrame.unstack">[docs]</a> <span class="k">def</span> <span class="nf">unstack</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrameOrSeries</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Pivot the (necessarily hierarchical) index labels.</span> |
| |
| <span class="sd"> Returns a DataFrame having a new level of column labels whose inner-most level</span> |
| <span class="sd"> consists of the pivoted index labels.</span> |
| |
| <span class="sd"> If the index is not a MultiIndex, the output will be a Series.</span> |
| |
| <span class="sd"> .. note:: If the index is a MultiIndex, the output DataFrame could be very wide, and</span> |
| <span class="sd"> it could cause serious performance degradation since Spark partitions data by row.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.pivot : Pivot a table based on column values.</span> |
| <span class="sd"> DataFrame.stack : Pivot a level of the column labels (inverse operation from unstack).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": {"0": "a", "1": "b", "2": "c"},</span> |
| <span class="sd"> ... "B": {"0": "1", "1": "3", "2": "5"},</span> |
| <span class="sd"> ... "C": {"0": "2", "1": "4", "2": "6"}},</span> |
| <span class="sd"> ... columns=["A", "B", "C"])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 a 1 2</span> |
| <span class="sd"> 1 b 3 4</span> |
| <span class="sd"> 2 c 5 6</span> |
| |
| <span class="sd"> >>> df.unstack().sort_index()</span> |
| <span class="sd"> A 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 c</span> |
| <span class="sd"> B 0 1</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 5</span> |
| <span class="sd"> C 0 2</span> |
| <span class="sd"> 1 4</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> >>> df.columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')])</span> |
| <span class="sd"> >>> df.unstack().sort_index()</span> |
| <span class="sd"> X A 0 a</span> |
| <span class="sd"> 1 b</span> |
| <span class="sd"> 2 c</span> |
| <span class="sd"> B 0 1</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 5</span> |
| <span class="sd"> Y C 0 2</span> |
| <span class="sd"> 1 4</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> dtype: object</span> |
| |
| <span class="sd"> For MultiIndex case:</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({"A": ["a", "b", "c"],</span> |
| <span class="sd"> ... "B": [1, 3, 5],</span> |
| <span class="sd"> ... "C": [2, 4, 6]},</span> |
| <span class="sd"> ... columns=["A", "B", "C"])</span> |
| <span class="sd"> >>> df = df.set_index('A', append=True)</span> |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 a 1 2</span> |
| <span class="sd"> 1 b 3 4</span> |
| <span class="sd"> 2 c 5 6</span> |
| <span class="sd"> >>> df.unstack().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A a b c a b c</span> |
| <span class="sd"> 0 1.0 NaN NaN 2.0 NaN NaN</span> |
| <span class="sd"> 1 NaN 3.0 NaN NaN 4.0 NaN</span> |
| <span class="sd"> 2 NaN NaN 5.0 NaN NaN 6.0</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="c1"># The index after `reset_index()` will never be used, so use "distributed" index</span> |
| <span class="c1"># as a dummy to avoid overhead.</span> |
| <span class="k">with</span> <span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.default_index_type"</span><span class="p">,</span> <span class="s2">"distributed"</span><span class="p">):</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">-</span> <span class="mi">1</span><span class="p">]</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">-</span> <span class="mi">1</span><span class="p">]</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">pivot_table</span><span class="p">(</span> |
| <span class="n">index</span><span class="o">=</span><span class="n">index</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="n">values</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">aggfunc</span><span class="o">=</span><span class="s2">"first"</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">df</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">[:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">-</span> <span class="mi">1</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="p">(</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="kc">None</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="k">else</span> <span class="n">df</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: The code here is similar to melt. Should we deduplicate?</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="n">ser_name</span> <span class="o">=</span> <span class="n">SPARK_DEFAULT_SERIES_NAME</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">new_index_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">)</span> |
| <span class="p">]</span> |
| |
| <span class="n">new_index_map</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">zip_longest</span><span class="p">(</span><span class="n">new_index_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> <span class="p">[]))</span> |
| |
| <span class="n">pairs</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">c</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">idx</span><span class="p">,</span> <span class="n">new_index_columns</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">idx</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">ser_name</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">idx</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"pairs.</span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">new_index_columns</span><span class="p">[:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">]</span> |
| <span class="p">]</span> <span class="o">+</span> <span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"pairs.</span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">ser_name</span><span class="p">)]</span> |
| |
| <span class="n">new_index_len</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">new_index_columns</span><span class="p">)</span> |
| <span class="n">existing_index_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">index_field</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span> |
| <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span> |
| <span class="p">):</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="n">new_index_len</span><span class="p">)</span> |
| <span class="n">new_index_map</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">name</span><span class="p">,</span> <span class="n">index_name</span><span class="p">,</span> <span class="n">index_field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)))</span> |
| <span class="n">existing_index_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> |
| |
| <span class="n">exploded_df</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"pairs"</span><span class="p">,</span> <span class="n">pairs</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">existing_index_columns</span> <span class="o">+</span> <span class="n">columns</span><span class="p">)</span> |
| |
| <span class="n">index_spark_column_names</span><span class="p">,</span> <span class="n">index_names</span><span class="p">,</span> <span class="n">index_fields</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">new_index_map</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span> |
| <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">exploded_df</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">exploded_df</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_names</span><span class="p">),</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="nb">list</span><span class="p">(</span><span class="n">index_fields</span><span class="p">),</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: axis, skipna, and many other arguments should be implemented.</span> |
| <div class="viewcode-block" id="DataFrame.all"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.all.html#pyspark.pandas.DataFrame.all">[docs]</a> <span class="k">def</span> <span class="nf">all</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return whether all elements are True.</span> |
| |
| <span class="sd"> Returns True unless there is at least one element within a series that is</span> |
| <span class="sd"> False or equivalent (e.g. zero or empty).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or 'index'}, default 0</span> |
| <span class="sd"> Indicate which axis or axes should be reduced.</span> |
| |
| <span class="sd"> * 0 / 'index' : reduce the index, return a Series whose index is the</span> |
| <span class="sd"> original column labels.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Create a dataframe from a dictionary.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'col1': [True, True, True],</span> |
| <span class="sd"> ... 'col2': [True, False, False],</span> |
| <span class="sd"> ... 'col3': [0, 0, 0],</span> |
| <span class="sd"> ... 'col4': [1, 2, 3],</span> |
| <span class="sd"> ... 'col5': [True, True, None],</span> |
| <span class="sd"> ... 'col6': [True, False, None]},</span> |
| <span class="sd"> ... columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6'])</span> |
| |
| <span class="sd"> Default behaviour checks if column-wise values all return True.</span> |
| |
| <span class="sd"> >>> df.all()</span> |
| <span class="sd"> col1 True</span> |
| <span class="sd"> col2 False</span> |
| <span class="sd"> col3 False</span> |
| <span class="sd"> col4 True</span> |
| <span class="sd"> col5 True</span> |
| <span class="sd"> col6 False</span> |
| <span class="sd"> dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">all_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"boolean"</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">all_col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">all_col</span><span class="p">))</span> |
| |
| <span class="c1"># TODO: there is similar transposing logic in, for instance,</span> |
| <span class="c1"># DataFrame.any and Series.quantile. Maybe we should deduplicate it.</span> |
| <span class="n">value_column</span> <span class="o">=</span> <span class="s2">"value"</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">applied_col</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">applied</span><span class="p">):</span> |
| <span class="n">cols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">label</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">applied_col</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">value_column</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"arrays"</span><span class="p">))</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"arrays"</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">selectExpr</span><span class="p">(</span><span class="s2">"col.*"</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">value_column</span><span class="p">)],</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: axis, skipna, and many arguments should be implemented.</span> |
| <div class="viewcode-block" id="DataFrame.any"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.any.html#pyspark.pandas.DataFrame.any">[docs]</a> <span class="k">def</span> <span class="nf">any</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return whether any element is True.</span> |
| |
| <span class="sd"> Returns False unless there is at least one element within a series that is</span> |
| <span class="sd"> True or equivalent (e.g. non-zero or non-empty).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or 'index'}, default 0</span> |
| <span class="sd"> Indicate which axis or axes should be reduced.</span> |
| |
| <span class="sd"> * 0 / 'index' : reduce the index, return a Series whose index is the</span> |
| <span class="sd"> original column labels.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Create a dataframe from a dictionary.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'col1': [False, False, False],</span> |
| <span class="sd"> ... 'col2': [True, False, False],</span> |
| <span class="sd"> ... 'col3': [0, 0, 1],</span> |
| <span class="sd"> ... 'col4': [0, 1, 2],</span> |
| <span class="sd"> ... 'col5': [False, False, None],</span> |
| <span class="sd"> ... 'col6': [True, False, None]},</span> |
| <span class="sd"> ... columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6'])</span> |
| |
| <span class="sd"> Default behaviour checks, column by column, whether any value is True or equivalent.</span> |
| |
| <span class="sd"> >>> df.any()</span> |
| <span class="sd"> col1 False</span> |
| <span class="sd"> col2 True</span> |
| <span class="sd"> col3 True</span> |
| <span class="sd"> col4 True</span> |
| <span class="sd"> col5 False</span> |
| <span class="sd"> col6 True</span> |
| <span class="sd"> dtype: bool</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">all_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"boolean"</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)))</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">all_col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="kc">False</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">all_col</span><span class="p">))</span> |
| |
| <span class="c1"># TODO: there is similar transpose logic in, for instance,</span> |
| <span class="c1"># DataFrame.all and Series.quantile. Maybe we should deduplicate it.</span> |
| <span class="n">value_column</span> <span class="o">=</span> <span class="s2">"value"</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">applied_col</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">applied</span><span class="p">):</span> |
| <span class="n">cols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">label</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">applied_col</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">value_column</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"arrays"</span><span class="p">))</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"arrays"</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">selectExpr</span><span class="p">(</span><span class="s2">"col.*"</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">value_column</span><span class="p">)],</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: add axis, numeric_only, pct, na_option parameters</span> |
| <div class="viewcode-block" id="DataFrame.rank"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rank.html#pyspark.pandas.DataFrame.rank">[docs]</a> <span class="k">def</span> <span class="nf">rank</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"average"</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute numerical data ranks (1 through n) along axis. Equal values are</span> |
| <span class="sd"> assigned a rank that is the average of the ranks of those values.</span> |
| |
| <span class="sd"> .. note:: the current implementation of rank uses Spark's Window without</span> |
| <span class="sd"> a partition specification. This moves all data into a single partition</span> |
| <span class="sd"> on a single machine and could cause serious performance degradation.</span> |
| <span class="sd"> Avoid using this method on very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> method : {'average', 'min', 'max', 'first', 'dense'}</span> |
| <span class="sd"> * average: average rank of group</span> |
| <span class="sd"> * min: lowest rank in group</span> |
| <span class="sd"> * max: highest rank in group</span> |
| <span class="sd"> * first: ranks assigned in order they appear in the array</span> |
| <span class="sd"> * dense: like 'min', but rank always increases by 1 between groups</span> |
| <span class="sd"> ascending : boolean, default True</span> |
| <span class="sd"> False for ranks by high (1) to low (N)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> ranks : same type as caller</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 2, 3], 'B': [4, 3, 2, 1]}, columns= ['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 1 2 3</span> |
| <span class="sd"> 2 2 2</span> |
| <span class="sd"> 3 3 1</span> |
| |
| <span class="sd"> >>> df.rank().sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1.0 4.0</span> |
| <span class="sd"> 1 2.5 3.0</span> |
| <span class="sd"> 2 2.5 2.0</span> |
| <span class="sd"> 3 4.0 1.0</span> |
| |
| <span class="sd"> If method is set to 'min', it uses the lowest rank in the group.</span> |
| |
| <span class="sd"> >>> df.rank(method='min').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1.0 4.0</span> |
| <span class="sd"> 1 2.0 3.0</span> |
| <span class="sd"> 2 2.0 2.0</span> |
| <span class="sd"> 3 4.0 1.0</span> |
| |
| <span class="sd"> If method is set to 'max', it uses the highest rank in the group.</span> |
| |
| <span class="sd"> >>> df.rank(method='max').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1.0 4.0</span> |
| <span class="sd"> 1 3.0 3.0</span> |
| <span class="sd"> 2 3.0 2.0</span> |
| <span class="sd"> 3 4.0 1.0</span> |
| |
| <span class="sd"> If method is set to 'dense', it leaves no gaps between groups.</span> |
| |
| <span class="sd"> >>> df.rank(method='dense').sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1.0 4.0</span> |
| <span class="sd"> 1 2.0 3.0</span> |
| <span class="sd"> 2 2.0 2.0</span> |
| <span class="sd"> 3 3.0 1.0</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_rank</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.filter"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.filter.html#pyspark.pandas.DataFrame.filter">[docs]</a> <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">items</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">like</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">regex</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Subset rows or columns of dataframe according to labels in</span> |
| <span class="sd"> the specified index.</span> |
| |
| <span class="sd"> Note that this routine does not filter a dataframe on its</span> |
| <span class="sd"> contents. The filter is applied to the labels of the index.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> items : list-like</span> |
| <span class="sd"> Keep labels from axis which are in items.</span> |
| <span class="sd"> like : string</span> |
| <span class="sd"> Keep labels from axis for which "like in label == True".</span> |
| <span class="sd"> regex : string (regular expression)</span> |
| <span class="sd"> Keep labels from axis for which re.search(regex, label) == True.</span> |
| <span class="sd"> axis : int or string axis name</span> |
| <span class="sd"> The axis to filter on. By default this is the info axis,</span> |
| <span class="sd"> 'index' for Series, 'columns' for DataFrame.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> same type as input object</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.loc</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The ``items``, ``like``, and ``regex`` parameters are</span> |
| <span class="sd"> enforced to be mutually exclusive.</span> |
| |
| <span class="sd"> ``axis`` defaults to the info axis that is used when indexing</span> |
| <span class="sd"> with ``[]``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),</span> |
| <span class="sd"> ... index=['mouse', 'rabbit'],</span> |
| <span class="sd"> ... columns=['one', 'two', 'three'])</span> |
| |
| <span class="sd"> >>> # select columns by name</span> |
| <span class="sd"> >>> df.filter(items=['one', 'three'])</span> |
| <span class="sd"> one three</span> |
| <span class="sd"> mouse 1 3</span> |
| <span class="sd"> rabbit 4 6</span> |
| |
| <span class="sd"> >>> # select columns by regular expression</span> |
| <span class="sd"> >>> df.filter(regex='e$', axis=1)</span> |
| <span class="sd"> one three</span> |
| <span class="sd"> mouse 1 3</span> |
| <span class="sd"> rabbit 4 6</span> |
| |
| <span class="sd"> >>> # select rows containing 'bbi'</span> |
| <span class="sd"> >>> df.filter(like='bbi', axis=0)</span> |
| <span class="sd"> one two three</span> |
| <span class="sd"> rabbit 4 5 6</span> |
| |
| <span class="sd"> For a Series,</span> |
| |
| <span class="sd"> >>> # select rows by name</span> |
| <span class="sd"> >>> df.one.filter(items=['rabbit'])</span> |
| <span class="sd"> rabbit 4</span> |
| <span class="sd"> Name: one, dtype: int64</span> |
| |
| <span class="sd"> >>> # select rows by regular expression</span> |
| <span class="sd"> >>> df.one.filter(regex='e$')</span> |
| <span class="sd"> mouse 1</span> |
| <span class="sd"> Name: one, dtype: int64</span> |
| |
| <span class="sd"> >>> # select rows containing 'bbi'</span> |
| <span class="sd"> >>> df.one.filter(like='bbi')</span> |
| <span class="sd"> rabbit 4</span> |
| <span class="sd"> Name: one, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">sum</span><span class="p">(</span><span class="n">x</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="p">(</span><span class="n">items</span><span class="p">,</span> <span class="n">like</span><span class="p">,</span> <span class="n">regex</span><span class="p">))</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Keyword arguments `items`, `like`, or `regex` "</span> <span class="s2">"are mutually exclusive"</span> |
| <span class="p">)</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">,</span> <span class="n">none_axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">index_scols</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| |
| <span class="k">if</span> <span class="n">items</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">items</span><span class="p">):</span> |
| <span class="n">items</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">items</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"items should be a list-like object."</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_scols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">items</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">index_scols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">|</span> <span class="p">(</span><span class="n">index_scols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">item</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_scols</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="c1"># for multi-index</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">items</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">item</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Unsupported type </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">item</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">item</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"The item should not be empty."</span><span class="p">)</span> |
| <span class="n">midx_col</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">element</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">midx_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">midx_col</span> <span class="o">=</span> <span class="n">index_scols</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">element</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">midx_col</span> <span class="o">=</span> <span class="n">midx_col</span> <span class="o">&</span> <span class="p">(</span><span class="n">index_scols</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">element</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">midx_col</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">|</span> <span class="n">midx_col</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="p">[</span><span class="n">items</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">like</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">for</span> <span class="n">index_scol</span> <span class="ow">in</span> <span class="n">index_scols</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">index_scol</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">like</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">|</span> <span class="n">index_scol</span><span class="o">.</span><span class="n">contains</span><span class="p">(</span><span class="n">like</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="n">output_labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="n">like</span> <span class="ow">in</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">label</span><span class="p">)]</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="p">[</span><span class="n">output_labels</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">regex</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">for</span> <span class="n">index_scol</span> <span class="ow">in</span> <span class="n">index_scols</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">index_scol</span><span class="o">.</span><span class="n">rlike</span><span class="p">(</span><span class="n">regex</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">|</span> <span class="n">index_scol</span><span class="o">.</span><span class="n">rlike</span><span class="p">(</span><span class="n">regex</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="n">matcher</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">regex</span><span class="p">)</span> |
| <span class="n">output_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="n">matcher</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">label</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="p">[</span><span class="n">output_labels</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Must pass either `items`, `like`, or `regex`"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.rename"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rename.html#pyspark.pandas.DataFrame.rename">[docs]</a> <span class="k">def</span> <span class="nf">rename</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">mapper</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Dict</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Dict</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Dict</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="s2">"index"</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">errors</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"ignore"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd"> Alter axes labels.</span> |
| <span class="sd"> Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series</span> |
| <span class="sd"> will be left as-is. Extra labels listed don’t throw an error.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> mapper : dict-like or function</span> |
| <span class="sd"> Dict-like or function transformations to apply to that axis’ values.</span> |
| <span class="sd"> Use either `mapper` and `axis` to specify the axis to target with `mapper`, or `index`</span> |
| <span class="sd"> and `columns`.</span> |
| <span class="sd"> index : dict-like or function</span> |
| <span class="sd"> Alternative to specifying axis ("mapper, axis=0" is equivalent to "index=mapper").</span> |
| <span class="sd"> columns : dict-like or function</span> |
| <span class="sd"> Alternative to specifying axis ("mapper, axis=1" is equivalent to "columns=mapper").</span> |
| <span class="sd"> axis : int or str, default 'index'</span> |
| <span class="sd"> Axis to target with mapper. Can be either the axis name ('index', 'columns') or</span> |
| <span class="sd"> number (0, 1).</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> Whether to modify the DataFrame in place rather than returning a new one.</span> |
| <span class="sd"> level : int, default None</span> |
| <span class="sd"> In case of a MultiIndex, only rename labels in the specified level.</span> |
| <span class="sd"> errors : {'ignore', 'raise'}, default 'ignore'</span> |
| <span class="sd"> If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, or `columns`</span> |
| <span class="sd"> contains labels that are not present in the Index being transformed. If 'ignore',</span> |
| <span class="sd"> existing keys will be renamed and extra keys will be ignored.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame with the renamed axis labels, or None if `inplace` is True.</span> |
| |
| <span class="sd"> Raises</span> |
| <span class="sd"> ------</span> |
| <span class="sd"> `KeyError`</span> |
| <span class="sd"> If any of the labels is not found in the selected axis and "errors='raise'".</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf1 = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})</span> |
| <span class="sd"> >>> psdf1.rename(columns={"A": "a", "B": "c"}) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a c</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 1 2 5</span> |
| <span class="sd"> 2 3 6</span> |
| |
| <span class="sd"> >>> psdf1.rename(index={1: 10, 2: 20}) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 10 2 5</span> |
| <span class="sd"> 20 3 6</span> |
| |
| <span class="sd"> >>> def str_lower(s) -> str:</span> |
| <span class="sd"> ... return str.lower(s)</span> |
| <span class="sd"> >>> psdf1.rename(str_lower, axis='columns') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 1 2 5</span> |
| <span class="sd"> 2 3 6</span> |
| |
| <span class="sd"> >>> def mul10(x) -> int:</span> |
| <span class="sd"> ... return x * 10</span> |
| <span class="sd"> >>> psdf1.rename(mul10, axis='index') # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 4</span> |
| <span class="sd"> 10 2 5</span> |
| <span class="sd"> 20 3 6</span> |
| |
| <span class="sd"> >>> idx = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Y', 'D')])</span> |
| <span class="sd"> >>> psdf2 = ps.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=idx)</span> |
| <span class="sd"> >>> psdf2.rename(columns=str_lower, level=0) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> x y</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 2 3 4</span> |
| <span class="sd"> 1 5 6 7 8</span> |
| |
| <span class="sd"> >>> psdf3 = ps.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=idx, columns=list('ab'))</span> |
| <span class="sd"> >>> psdf3.rename(index=str_lower) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> x a 1 2</span> |
| <span class="sd"> b 3 4</span> |
| <span class="sd"> y c 5 6</span> |
| <span class="sd"> d 7 8</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">gen_mapper_fn</span><span class="p">(</span> |
| <span class="n">mapper</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Dict</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Any</span><span class="p">],</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">DataType</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">mapper</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span> |
| <span class="n">mapper_dict</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="nb">dict</span><span class="p">,</span> <span class="n">mapper</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">mapper_dict</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">"raise"</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">"Index include label which is not in the `mapper`."</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span> |
| |
| <span class="n">type_set</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">mapper_dict</span><span class="o">.</span><span class="n">values</span><span class="p">()))</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">type_set</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Mapper dict should have the same value type."</span><span class="p">)</span> |
| <span class="n">dtype</span><span class="p">,</span> <span class="n">spark_return_type</span> <span class="o">=</span> <span class="n">pandas_on_spark_type</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">type_set</span><span class="p">)[</span><span class="mi">0</span><span class="p">])</span> |
| |
| <span class="k">def</span> <span class="nf">mapper_fn</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">mapper_dict</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">mapper_dict</span><span class="p">[</span><span class="n">x</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">"raise"</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">"Index include value which is not in the `mapper`"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">x</span> |
| |
| <span class="k">elif</span> <span class="n">callable</span><span class="p">(</span><span class="n">mapper</span><span class="p">):</span> |
| <span class="n">mapper_callable</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Callable</span><span class="p">,</span> <span class="n">mapper</span><span class="p">)</span> |
| <span class="n">return_type</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">ScalarType</span><span class="p">,</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">mapper</span><span class="p">))</span> |
| <span class="n">dtype</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="n">spark_return_type</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span> |
| |
| <span class="k">def</span> <span class="nf">mapper_fn</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">mapper_callable</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"`mapper` or `index` or `columns` should be "</span> |
| <span class="s2">"either dict-like or function type."</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">mapper_fn</span><span class="p">,</span> <span class="n">dtype</span><span class="p">,</span> <span class="n">spark_return_type</span> |
| |
| <span class="n">index_mapper_fn</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">index_mapper_ret_stype</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">columns_mapper_fn</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">mapper</span><span class="p">:</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">index_mapper_fn</span><span class="p">,</span> <span class="n">index_mapper_ret_dtype</span><span class="p">,</span> <span class="n">index_mapper_ret_stype</span> <span class="o">=</span> <span class="n">gen_mapper_fn</span><span class="p">(</span> |
| <span class="n">mapper</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">columns_mapper_fn</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">gen_mapper_fn</span><span class="p">(</span><span class="n">mapper</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">index</span><span class="p">:</span> |
| <span class="n">index_mapper_fn</span><span class="p">,</span> <span class="n">index_mapper_ret_dtype</span><span class="p">,</span> <span class="n">index_mapper_ret_stype</span> <span class="o">=</span> <span class="n">gen_mapper_fn</span><span class="p">(</span> |
| <span class="n">index</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">columns</span><span class="p">:</span> |
| <span class="n">columns_mapper_fn</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">gen_mapper_fn</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">index</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">columns</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Either `index` or `columns` should be provided."</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">index_mapper_fn</span><span class="p">:</span> |
| <span class="c1"># Rename index labels. If `level` is None, rename all index columns; otherwise only</span> |
| <span class="c1"># rename the index column of the specified level.</span> |
| <span class="c1"># This is implemented by transforming the underlying Spark DataFrame.</span> |
| <span class="c1"># Example:</span> |
| <span class="c1"># Suppose the index columns in the underlying Spark DataFrame are "index_0", "index_1".</span> |
| <span class="c1"># Renaming the level 0 index labels is conceptually:</span> |
| <span class="c1"># ``psdf._sdf.withColumn("index_0", mapper_fn_udf(col("index_0")))``</span> |
| <span class="c1"># Renaming all index labels (`level` is None) is conceptually:</span> |
| <span class="c1"># ```</span> |
| <span class="c1"># psdf._sdf.withColumn("index_0", mapper_fn_udf(col("index_0")))</span> |
| <span class="c1"># .withColumn("index_1", mapper_fn_udf(col("index_1")))</span> |
| <span class="c1"># ```</span> |
| |
| <span class="n">index_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="n">num_indices</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_columns</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">level</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="o"><</span> <span class="mi">0</span> <span class="ow">or</span> <span class="n">level</span> <span class="o">>=</span> <span class="n">num_indices</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"level should be an integer between [0, num_indices)"</span><span class="p">)</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="n">index_mapper_ret_stype</span><span class="p">)</span> <span class="c1"># type: ignore</span> |
| <span class="k">def</span> <span class="nf">index_mapper_udf</span><span class="p">(</span><span class="n">s</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">s</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">index_mapper_fn</span><span class="p">)</span> |
| |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">num_indices</span><span class="p">):</span> |
| <span class="n">index_spark_columns</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">index_mapper_udf</span><span class="p">(</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">i</span><span class="p">])</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span> |
| <span class="n">index_columns</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> |
| <span class="p">)</span> |
| <span class="n">index_fields</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">index_fields</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">index_mapper_ret_dtype</span><span class="p">,</span> |
| <span class="n">spark_type</span><span class="o">=</span><span class="n">index_mapper_ret_stype</span><span class="p">,</span> |
| <span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_spark_columns</span><span class="p">[</span><span class="n">level</span><span class="p">]</span> <span class="o">=</span> <span class="n">index_mapper_udf</span><span class="p">(</span><span class="n">index_spark_columns</span><span class="p">[</span><span class="n">level</span><span class="p">])</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span> |
| <span class="n">index_columns</span><span class="p">[</span><span class="n">level</span><span class="p">]</span> |
| <span class="p">)</span> |
| <span class="n">index_fields</span><span class="p">[</span><span class="n">level</span><span class="p">]</span> <span class="o">=</span> <span class="n">index_fields</span><span class="p">[</span><span class="n">level</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">index_mapper_ret_dtype</span><span class="p">,</span> |
| <span class="n">spark_type</span><span class="o">=</span><span class="n">index_mapper_ret_stype</span><span class="p">,</span> |
| <span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">columns_mapper_fn</span><span class="p">:</span> |
| <span class="c1"># Rename column labels.</span> |
| <span class="c1"># Modifies `_internal._column_labels` and transforms the underlying Spark DataFrame</span> |
| <span class="c1"># so that its column names match `_internal._column_labels`.</span> |
| <span class="k">if</span> <span class="n">level</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="o"><</span> <span class="mi">0</span> <span class="ow">or</span> <span class="n">level</span> <span class="o">>=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"level should be an integer between [0, column_labels_level)"</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">gen_new_column_labels_entry</span><span class="p">(</span><span class="n">column_labels_entry</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-></span> <span class="n">Label</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">level</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="c1"># rename all level columns</span> |
| <span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">columns_mapper_fn</span><span class="p">,</span> <span class="n">column_labels_entry</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># only rename specified level column</span> |
| <span class="n">entry_list</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">column_labels_entry</span><span class="p">)</span> |
| <span class="n">entry_list</span><span class="p">[</span><span class="n">level</span><span class="p">]</span> <span class="o">=</span> <span class="n">columns_mapper_fn</span><span class="p">(</span><span class="n">entry_list</span><span class="p">[</span><span class="n">level</span><span class="p">])</span> |
| <span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">entry_list</span><span class="p">)</span> |
| |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">gen_new_column_labels_entry</span><span class="p">,</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">))</span> |
| |
| <span class="n">new_data_pssers</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">old_label</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">new_label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">old_label</span><span class="p">,</span> <span class="n">new_label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">new_column_labels</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">new_data_pssers</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.rename_axis"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.rename_axis.html#pyspark.pandas.DataFrame.rename_axis">[docs]</a> <span class="k">def</span> <span class="nf">rename_axis</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">mapper</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Name</span><span class="p">],</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">index</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Name</span><span class="p">],</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Name</span><span class="p">],</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Set the name of the axis for the index or columns.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> mapper : scalar, list-like, optional</span> |
| <span class="sd"> A scalar, list-like, dict-like or function transformation to</span> |
| <span class="sd"> apply to the axis name attribute.</span> |
| <span class="sd"> index, columns : scalar, list-like, dict-like or function, optional</span> |
| <span class="sd"> A scalar, list-like, dict-like or function transformation to</span> |
| <span class="sd"> apply to that axis' values.</span> |
| |
| <span class="sd"> Use either ``mapper`` and ``axis`` to</span> |
| <span class="sd"> specify the axis to target with ``mapper``, or ``index``</span> |
| <span class="sd"> and/or ``columns``.</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns'}, default 0</span> |
| <span class="sd"> The axis to rename.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> Modifies the object directly, instead of creating a new DataFrame.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame, or None if `inplace` is True.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.rename : Alter Series index labels or name.</span> |
| <span class="sd"> DataFrame.rename : Alter DataFrame index labels or name.</span> |
| <span class="sd"> Index.rename : Set new names on index.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> ``DataFrame.rename_axis`` supports two calling conventions:</span> |
| |
| <span class="sd"> * ``(index=index_mapper, columns=columns_mapper, ...)``</span> |
| <span class="sd"> * ``(mapper, axis={'index', 'columns'}, ...)``</span> |
| |
| <span class="sd"> The first calling convention will only modify the names of</span> |
| <span class="sd"> the index and/or the names of the Index object that is the columns.</span> |
| |
| <span class="sd"> The second calling convention will modify the names of the</span> |
| <span class="sd"> corresponding index specified by axis.</span> |
| |
| <span class="sd"> We *highly* recommend using keyword arguments to clarify your</span> |
| <span class="sd"> intent.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"num_legs": [4, 4, 2],</span> |
| <span class="sd"> ... "num_arms": [0, 0, 2]},</span> |
| <span class="sd"> ... index=["dog", "cat", "monkey"],</span> |
| <span class="sd"> ... columns=["num_legs", "num_arms"])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> num_legs num_arms</span> |
| <span class="sd"> dog 4 0</span> |
| <span class="sd"> cat 4 0</span> |
| <span class="sd"> monkey 2 2</span> |
| |
| <span class="sd"> >>> df = df.rename_axis("animal").sort_index()</span> |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs num_arms</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> cat 4 0</span> |
| <span class="sd"> dog 4 0</span> |
| <span class="sd"> monkey 2 2</span> |
| |
| <span class="sd"> >>> df = df.rename_axis("limbs", axis="columns").sort_index()</span> |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> limbs num_legs num_arms</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> cat 4 0</span> |
| <span class="sd"> dog 4 0</span> |
| <span class="sd"> monkey 2 2</span> |
| |
| <span class="sd"> **MultiIndex**</span> |
| |
| <span class="sd"> >>> index = pd.MultiIndex.from_product([['mammal'],</span> |
| <span class="sd"> ... ['dog', 'cat', 'monkey']],</span> |
| <span class="sd"> ... names=['type', 'name'])</span> |
| <span class="sd"> >>> df = ps.DataFrame({"num_legs": [4, 4, 2],</span> |
| <span class="sd"> ... "num_arms": [0, 0, 2]},</span> |
| <span class="sd"> ... index=index,</span> |
| <span class="sd"> ... columns=["num_legs", "num_arms"])</span> |
| <span class="sd"> >>> df # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs num_arms</span> |
| <span class="sd"> type name</span> |
| <span class="sd"> mammal dog 4 0</span> |
| <span class="sd"> cat 4 0</span> |
| <span class="sd"> monkey 2 2</span> |
| |
| <span class="sd"> >>> df.rename_axis(index={'type': 'class'}).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs num_arms</span> |
| <span class="sd"> class name</span> |
| <span class="sd"> mammal cat 4 0</span> |
| <span class="sd"> dog 4 0</span> |
| <span class="sd"> monkey 2 2</span> |
| |
| <span class="sd"> >>> df.rename_axis(index=str.upper).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> num_legs num_arms</span> |
| <span class="sd"> TYPE NAME</span> |
| <span class="sd"> mammal cat 4 0</span> |
| <span class="sd"> dog 4 0</span> |
| <span class="sd"> monkey 2 2</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">gen_names</span><span class="p">(</span> |
| <span class="n">v</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Name</span><span class="p">],</span> <span class="n">Any</span><span class="p">]],</span> |
| <span class="n">curnames</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="n">is_scalar</span><span class="p">(</span><span class="n">v</span><span class="p">):</span> |
| <span class="n">newnames</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Any</span><span class="p">,</span> <span class="n">v</span><span class="p">)]</span> <span class="c1"># type: List[Name]</span> |
| <span class="k">elif</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">is_dict_like</span><span class="p">(</span><span class="n">v</span><span class="p">):</span> |
| <span class="n">newnames</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">v</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="n">is_dict_like</span><span class="p">(</span><span class="n">v</span><span class="p">):</span> |
| <span class="n">v_dict</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">v</span><span class="p">)</span> |
| <span class="n">newnames</span> <span class="o">=</span> <span class="p">[</span><span class="n">v_dict</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">v_dict</span> <span class="k">else</span> <span class="n">name</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">curnames</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">callable</span><span class="p">(</span><span class="n">v</span><span class="p">):</span> |
| <span class="n">v_callable</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Name</span><span class="p">],</span> <span class="n">Any</span><span class="p">],</span> <span class="n">v</span><span class="p">)</span> |
| <span class="n">newnames</span> <span class="o">=</span> <span class="p">[</span><span class="n">v_callable</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">curnames</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"`mapper` or `index` or `columns` should be "</span> |
| <span class="s2">"either dict-like or function type."</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">newnames</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">curnames</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Length of new names must be </span><span class="si">{}</span><span class="s2">, got </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">curnames</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">newnames</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="p">[</span><span class="n">name</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">name</span><span class="p">,)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">newnames</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">mapper</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="p">(</span><span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Cannot specify both 'mapper' and any of 'index' or 'columns'."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">mapper</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="n">mapper</span> |
| <span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">columns</span> <span class="o">=</span> <span class="n">mapper</span> |
| |
| <span class="n">column_label_names</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">gen_names</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">names</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span> |
| <span class="p">)</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">gen_names</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span><span class="p">)</span> <span class="k">if</span> <span class="n">index</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> <span class="n">column_label_names</span><span class="o">=</span><span class="n">column_label_names</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.keys"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.keys.html#pyspark.pandas.DataFrame.keys">[docs]</a> <span class="k">def</span> <span class="nf">keys</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return alias for columns.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Index</span> |
| <span class="sd"> Columns of the DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]],</span> |
| <span class="sd"> ... index=['cobra', 'viper', 'sidewinder'],</span> |
| <span class="sd"> ... columns=['max_speed', 'shield'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> max_speed shield</span> |
| <span class="sd"> cobra 1 2</span> |
| <span class="sd"> viper 4 5</span> |
| <span class="sd"> sidewinder 7 8</span> |
| |
| <span class="sd"> >>> df.keys()</span> |
| <span class="sd"> Index(['max_speed', 'shield'], dtype='object')</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">columns</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.pct_change"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.pct_change.html#pyspark.pandas.DataFrame.pct_change">[docs]</a> <span class="k">def</span> <span class="nf">pct_change</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Percentage change between the current and a prior element.</span> |
| |
| <span class="sd"> .. note:: the current implementation of this API uses Spark's Window without</span> |
| <span class="sd"> specifying a partition specification. This moves all data into a</span> |
| <span class="sd"> single partition on a single machine and could cause serious</span> |
| <span class="sd"> performance degradation. Avoid using this method on very large datasets.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : int, default 1</span> |
| <span class="sd"> Periods to shift for forming percent change.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Percentage change in French franc, Deutsche Mark, and Italian lira</span> |
| <span class="sd"> from 1980-01-01 to 1980-03-01.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'FR': [4.0405, 4.0963, 4.3149],</span> |
| <span class="sd"> ... 'GR': [1.7246, 1.7482, 1.8519],</span> |
| <span class="sd"> ... 'IT': [804.74, 810.01, 860.13]},</span> |
| <span class="sd"> ... index=['1980-01-01', '1980-02-01', '1980-03-01'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> FR GR IT</span> |
| <span class="sd"> 1980-01-01 4.0405 1.7246 804.74</span> |
| <span class="sd"> 1980-02-01 4.0963 1.7482 810.01</span> |
| <span class="sd"> 1980-03-01 4.3149 1.8519 860.13</span> |
| |
| <span class="sd"> >>> df.pct_change()</span> |
| <span class="sd"> FR GR IT</span> |
| <span class="sd"> 1980-01-01 NaN NaN NaN</span> |
| <span class="sd"> 1980-02-01 0.013810 0.013684 0.006549</span> |
| <span class="sd"> 1980-03-01 0.053365 0.059318 0.061876</span> |
| |
| <span class="sd"> You can set the number of periods to shift when forming the percent change:</span> |
| |
| <span class="sd"> >>> df.pct_change(2)</span> |
| <span class="sd"> FR GR IT</span> |
| <span class="sd"> 1980-01-01 NaN NaN NaN</span> |
| <span class="sd"> 1980-02-01 NaN NaN NaN</span> |
| <span class="sd"> 1980-03-01 0.067912 0.073814 0.06883</span> |
| <span class="sd"> """</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="n">periods</span><span class="p">,</span> <span class="o">-</span><span class="n">periods</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">op</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">prev_row</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> |
| <span class="k">return</span> <span class="p">((</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="o">-</span> <span class="n">prev_row</span><span class="p">)</span> <span class="o">/</span> <span class="n">prev_row</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="n">op</span><span class="p">,</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: axis = 1</span> |
| <div class="viewcode-block" id="DataFrame.idxmax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.idxmax.html#pyspark.pandas.DataFrame.idxmax">[docs]</a> <span class="k">def</span> <span class="nf">idxmax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return index of first occurrence of maximum over requested axis.</span> |
| <span class="sd"> NA/null values are excluded.</span> |
| |
| <span class="sd"> .. note:: This API collects all rows with a maximum value using `to_pandas()`</span> |
| <span class="sd"> because we assume the number of rows with max values is usually small.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : 0 or 'index'</span> |
| <span class="sd"> Can only be set to 0 at the moment.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.idxmax</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2],</span> |
| <span class="sd"> ... 'b': [4.0, 2.0, 3.0, 1.0],</span> |
| <span class="sd"> ... 'c': [300, 200, 400, 200]})</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 4.0 300</span> |
| <span class="sd"> 1 2 2.0 200</span> |
| <span class="sd"> 2 3 3.0 400</span> |
| <span class="sd"> 3 2 1.0 200</span> |
| |
| <span class="sd"> >>> psdf.idxmax()</span> |
| <span class="sd"> a 2</span> |
| <span class="sd"> b 0</span> |
| <span class="sd"> c 2</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> For multi-level (MultiIndex) columns:</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2],</span> |
| <span class="sd"> ... 'b': [4.0, 2.0, 3.0, 1.0],</span> |
| <span class="sd"> ... 'c': [300, 200, 400, 200]})</span> |
| <span class="sd"> >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> x y z</span> |
| <span class="sd"> 0 1 4.0 300</span> |
| <span class="sd"> 1 2 2.0 200</span> |
| <span class="sd"> 2 3 3.0 400</span> |
| <span class="sd"> 3 2 1.0 200</span> |
| |
| <span class="sd"> >>> psdf.idxmax()</span> |
| <span class="sd"> a x 2</span> |
| <span class="sd"> b y 0</span> |
| <span class="sd"> c z 2</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">max_cols</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">scol</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> |
| <span class="n">sdf_max</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">max_cols</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| <span class="c1"># `sdf_max` looks like below</span> |
| <span class="c1"># +------+------+------+</span> |
| <span class="c1"># |(a, x)|(b, y)|(c, z)|</span> |
| <span class="c1"># +------+------+------+</span> |
| <span class="c1"># | 3| 4.0| 400|</span> |
| <span class="c1"># +------+------+------+</span> |
| |
| <span class="n">conds</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">scol</span> <span class="o">==</span> <span class="n">max_val</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">max_val</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">,</span> <span class="n">sdf_max</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">|</span> <span class="n">y</span><span class="p">,</span> <span class="n">conds</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> <span class="c1"># type: "DataFrame"</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">idxmax</span><span class="p">()))</span></div> |
| |
| <span class="c1"># TODO: axis = 1</span> |
| <div class="viewcode-block" id="DataFrame.idxmin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.idxmin.html#pyspark.pandas.DataFrame.idxmin">[docs]</a> <span class="k">def</span> <span class="nf">idxmin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return index of first occurrence of minimum over requested axis.</span> |
| <span class="sd"> NA/null values are excluded.</span> |
| |
| <span class="sd"> .. note:: This API collects all rows with a minimum value using `to_pandas()`</span> |
| <span class="sd"> because we assume the number of rows with min values is usually small.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : 0 or 'index'</span> |
| <span class="sd"> Can only be set to 0 at the moment.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.idxmin</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2],</span> |
| <span class="sd"> ... 'b': [4.0, 2.0, 3.0, 1.0],</span> |
| <span class="sd"> ... 'c': [300, 200, 400, 200]})</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 4.0 300</span> |
| <span class="sd"> 1 2 2.0 200</span> |
| <span class="sd"> 2 3 3.0 400</span> |
| <span class="sd"> 3 2 1.0 200</span> |
| |
| <span class="sd"> >>> psdf.idxmin()</span> |
| <span class="sd"> a 0</span> |
| <span class="sd"> b 3</span> |
| <span class="sd"> c 1</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> For multi-level (MultiIndex) columns:</span> |
| |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2],</span> |
| <span class="sd"> ... 'b': [4.0, 2.0, 3.0, 1.0],</span> |
| <span class="sd"> ... 'c': [300, 200, 400, 200]})</span> |
| <span class="sd"> >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> x y z</span> |
| <span class="sd"> 0 1 4.0 300</span> |
| <span class="sd"> 1 2 2.0 200</span> |
| <span class="sd"> 2 3 3.0 400</span> |
| <span class="sd"> 3 2 1.0 200</span> |
| |
| <span class="sd"> >>> psdf.idxmin()</span> |
| <span class="sd"> a x 0</span> |
| <span class="sd"> b y 3</span> |
| <span class="sd"> c z 1</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">min_cols</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">scol</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">scol</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> |
| <span class="n">sdf_min</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">min_cols</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> |
| |
| <span class="n">conds</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">scol</span> <span class="o">==</span> <span class="n">min_val</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">min_val</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">,</span> <span class="n">sdf_min</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">|</span> <span class="n">y</span><span class="p">,</span> <span class="n">conds</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">cond</span><span class="p">))</span> <span class="c1"># type: "DataFrame"</span> |
| |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">idxmin</span><span class="p">()))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.info"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.info.html#pyspark.pandas.DataFrame.info">[docs]</a> <span class="k">def</span> <span class="nf">info</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">verbose</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">max_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">null_counts</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Print a concise summary of a DataFrame.</span> |
| |
| <span class="sd"> This method prints information about a DataFrame including</span> |
| <span class="sd"> the index dtype and column dtypes, non-null values and memory usage.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> verbose : bool, optional</span> |
| <span class="sd"> Whether to print the full summary.</span> |
| <span class="sd"> buf : writable buffer, defaults to sys.stdout</span> |
| <span class="sd"> Where to send the output. By default, the output is printed to</span> |
| <span class="sd"> sys.stdout. Pass a writable buffer if you need to further process</span> |
| <span class="sd"> the output.</span> |
| <span class="sd"> max_cols : int, optional</span> |
| <span class="sd"> When to switch from the verbose to the truncated output. If the</span> |
| <span class="sd"> DataFrame has more than `max_cols` columns, the truncated output</span> |
| <span class="sd"> is used.</span> |
| <span class="sd"> null_counts : bool, optional</span> |
| <span class="sd"> Whether to show the non-null counts.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> None</span> |
| <span class="sd"> This method prints a summary of a DataFrame and returns None.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.describe: Generate descriptive statistics of DataFrame</span> |
| <span class="sd"> columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> int_values = [1, 2, 3, 4, 5]</span> |
| <span class="sd"> >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']</span> |
| <span class="sd"> >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... {"int_col": int_values, "text_col": text_values, "float_col": float_values},</span> |
| <span class="sd"> ... columns=['int_col', 'text_col', 'float_col'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> int_col text_col float_col</span> |
| <span class="sd"> 0 1 alpha 0.00</span> |
| <span class="sd"> 1 2 beta 0.25</span> |
| <span class="sd"> 2 3 gamma 0.50</span> |
| <span class="sd"> 3 4 delta 0.75</span> |
| <span class="sd"> 4 5 epsilon 1.00</span> |
| |
| <span class="sd"> Prints information of all columns:</span> |
| |
| <span class="sd"> >>> df.info(verbose=True) # doctest: +SKIP</span> |
| <span class="sd"> <class 'pyspark.pandas.frame.DataFrame'></span> |
| <span class="sd"> Index: 5 entries, 0 to 4</span> |
| <span class="sd"> Data columns (total 3 columns):</span> |
| <span class="sd"> # Column Non-Null Count Dtype</span> |
| <span class="sd"> --- ------ -------------- -----</span> |
| <span class="sd"> 0 int_col 5 non-null int64</span> |
| <span class="sd"> 1 text_col 5 non-null object</span> |
| <span class="sd"> 2 float_col 5 non-null float64</span> |
| <span class="sd"> dtypes: float64(1), int64(1), object(1)</span> |
| |
| <span class="sd"> Prints a summary of the column count and dtypes, but not per-column</span> |
| <span class="sd"> information:</span> |
| |
| <span class="sd"> >>> df.info(verbose=False) # doctest: +SKIP</span> |
| <span class="sd"> <class 'pyspark.pandas.frame.DataFrame'></span> |
| <span class="sd"> Index: 5 entries, 0 to 4</span> |
| <span class="sd"> Columns: 3 entries, int_col to float_col</span> |
| <span class="sd"> dtypes: float64(1), int64(1), object(1)</span> |
| |
| <span class="sd"> Pipe the output of DataFrame.info to a buffer instead of sys.stdout, get the</span> |
| <span class="sd"> buffer content and write it to a text file:</span> |
| |
| <span class="sd"> >>> import io</span> |
| <span class="sd"> >>> buffer = io.StringIO()</span> |
| <span class="sd"> >>> df.info(buf=buffer)</span> |
| <span class="sd"> >>> s = buffer.getvalue()</span> |
| <span class="sd"> >>> with open('%s/info.txt' % path, "w",</span> |
| <span class="sd"> ... encoding="utf-8") as f:</span> |
| <span class="sd"> ... _ = f.write(s)</span> |
| <span class="sd"> >>> with open('%s/info.txt' % path) as f:</span> |
| <span class="sd"> ... f.readlines() # doctest: +SKIP</span> |
| <span class="sd"> ["<class 'pyspark.pandas.frame.DataFrame'>\\n",</span> |
| <span class="sd"> 'Index: 5 entries, 0 to 4\\n',</span> |
| <span class="sd"> 'Data columns (total 3 columns):\\n',</span> |
| <span class="sd"> ' # Column Non-Null Count Dtype \\n',</span> |
| <span class="sd"> '--- ------ -------------- ----- \\n',</span> |
| <span class="sd"> ' 0 int_col 5 non-null int64 \\n',</span> |
| <span class="sd"> ' 1 text_col 5 non-null object \\n',</span> |
| <span class="sd"> ' 2 float_col 5 non-null float64\\n',</span> |
| <span class="sd"> 'dtypes: float64(1), int64(1), object(1)']</span> |
| <span class="sd"> """</span> |
| <span class="c1"># To prevent pandas' existing config from affecting pandas-on-Spark.</span> |
| <span class="c1"># TODO: should we have corresponding pandas-on-Spark configs?</span> |
| <span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">option_context</span><span class="p">(</span> |
| <span class="s2">"display.max_info_columns"</span><span class="p">,</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span><span class="p">,</span> <span class="s2">"display.max_info_rows"</span><span class="p">,</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxsize</span> |
| <span class="p">):</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="c1"># hack to use pandas' info as is.</span> |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_data"</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="n">count_func</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">count</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">count</span> <span class="o">=</span> <span class="k">lambda</span><span class="p">:</span> <span class="n">count_func</span><span class="p">()</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span> <span class="c1"># type: ignore</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">info</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">verbose</span><span class="o">=</span><span class="n">verbose</span><span class="p">,</span> |
| <span class="n">buf</span><span class="o">=</span><span class="n">buf</span><span class="p">,</span> |
| <span class="n">max_cols</span><span class="o">=</span><span class="n">max_cols</span><span class="p">,</span> |
| <span class="n">memory_usage</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="n">null_counts</span><span class="o">=</span><span class="n">null_counts</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">finally</span><span class="p">:</span> |
| <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">_data</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">count</span> <span class="o">=</span> <span class="n">count_func</span> <span class="c1"># type: ignore</span></div> |
| |
| <span class="c1"># TODO: fix parameter 'axis' and 'numeric_only' to work same as pandas'</span> |
| <div class="viewcode-block" id="DataFrame.quantile"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.quantile.html#pyspark.pandas.DataFrame.quantile">[docs]</a> <span class="k">def</span> <span class="nf">quantile</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">q</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="mf">0.5</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrameOrSeries</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return value at the given quantile.</span> |
| |
| <span class="sd"> .. note:: Unlike pandas', the quantile in pandas-on-Spark is an approximated quantile</span> |
| <span class="sd"> based upon approximate percentile computation because computing quantile across a</span> |
| <span class="sd"> large dataset is extremely expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> q : float or array-like, default 0.5 (50% quantile)</span> |
| <span class="sd"> 0 <= q <= 1, the quantile(s) to compute.</span> |
| <span class="sd"> axis : int or str, default 0 or 'index'</span> |
| <span class="sd"> Can only be set to 0 at the moment.</span> |
| <span class="sd"> numeric_only : bool, default True</span> |
| <span class="sd"> If False, the quantile of datetime and timedelta data will be computed as well.</span> |
| <span class="sd"> Can only be set to True at the moment.</span> |
| <span class="sd"> accuracy : int, optional</span> |
| <span class="sd"> Default accuracy of approximation. Larger value means better accuracy.</span> |
| <span class="sd"> The relative error can be deduced by 1.0 / accuracy.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> If q is an array, a DataFrame will be returned where the</span> |
| <span class="sd"> index is q, the columns are the columns of self, and the values are the quantiles.</span> |
| <span class="sd"> If q is a float, a Series will be returned where the</span> |
| <span class="sd"> index is the columns of self and the values are the quantiles.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [6, 7, 8, 9, 0]})</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 6</span> |
| <span class="sd"> 1 2 7</span> |
| <span class="sd"> 2 3 8</span> |
| <span class="sd"> 3 4 9</span> |
| <span class="sd"> 4 5 0</span> |
| |
| <span class="sd"> >>> psdf.quantile(.5)</span> |
| <span class="sd"> a 3.0</span> |
| <span class="sd"> b 7.0</span> |
| <span class="sd"> Name: 0.5, dtype: float64</span> |
| |
| <span class="sd"> >>> psdf.quantile([.25, .5, .75])</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0.25 2.0 6.0</span> |
| <span class="sd"> 0.50 3.0 7.0</span> |
| <span class="sd"> 0.75 4.0 8.0</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">'axis should be either 0 or "index" currently.'</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="p">)</span> |
| |
| <span class="n">qq</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">q</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">q</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">)</span> <span class="k">else</span> <span class="n">q</span> <span class="c1"># type: Union[float, List[float]]</span> |
| |
| <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">qq</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">qq</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="k">else</span> <span class="p">[</span><span class="n">qq</span><span class="p">]:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">float</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"q must be a float or an array of floats; however, [</span><span class="si">%s</span><span class="s2">] found."</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">v</span> <span class="o"><</span> <span class="mf">0.0</span> <span class="ow">or</span> <span class="n">v</span> <span class="o">></span> <span class="mf">1.0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"percentiles should all be in the interval [0, 1]."</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">quantile</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">"Series"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="p">(</span><span class="n">BooleanType</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)):</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">()),</span> <span class="n">qq</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">qq</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="c1"># First calculate the percentiles from all columns and map them to each quantile in `qq`</span> |
| <span class="c1"># by creating each entry as a struct. So, it becomes an array of structs as below:</span> |
| <span class="c1">#</span> |
| <span class="c1"># +-----------------------------------------+</span> |
| <span class="c1"># | arrays|</span> |
| <span class="c1"># +-----------------------------------------+</span> |
| <span class="c1"># |[[0.25, 2, 6], [0.5, 3, 7], [0.75, 4, 8]]|</span> |
| <span class="c1"># +-----------------------------------------+</span> |
| |
| <span class="n">percentile_cols</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">percentile_col_names</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">column</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="p">):</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="n">is_numeric_or_boolean</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">keep_column</span> <span class="o">=</span> <span class="ow">not</span> <span class="n">numeric_only</span> <span class="ow">or</span> <span class="n">is_numeric_or_boolean</span> |
| |
| <span class="k">if</span> <span class="n">keep_column</span><span class="p">:</span> |
| <span class="n">percentile_col</span> <span class="o">=</span> <span class="n">quantile</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> |
| <span class="n">percentile_cols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">percentile_col</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">column</span><span class="p">))</span> |
| <span class="n">percentile_col_names</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column</span><span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">percentile_cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">qq</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">percentile_cols</span><span class="p">)</span> |
| <span class="c1"># Here, after select percentile cols, a spark_frame looks like below:</span> |
| <span class="c1"># +---------+---------+</span> |
| <span class="c1"># | a| b|</span> |
| <span class="c1"># +---------+---------+</span> |
| <span class="c1"># |[2, 3, 4]|[6, 7, 8]|</span> |
| <span class="c1"># +---------+---------+</span> |
| |
| <span class="n">cols_dict</span> <span class="o">=</span> <span class="n">OrderedDict</span><span class="p">()</span> <span class="c1"># type: OrderedDict</span> |
| <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">percentile_col_names</span><span class="p">:</span> |
| <span class="n">cols_dict</span><span class="p">[</span><span class="n">column</span><span class="p">]</span> <span class="o">=</span> <span class="nb">list</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">qq</span><span class="p">)):</span> |
| <span class="n">cols_dict</span><span class="p">[</span><span class="n">column</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">column</span><span class="p">)[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">column</span><span class="p">))</span> |
| |
| <span class="n">internal_index_column</span> <span class="o">=</span> <span class="n">SPARK_DEFAULT_INDEX_NAME</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">cols_dict</span><span class="o">.</span><span class="n">values</span><span class="p">())):</span> |
| <span class="n">cols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">qq</span><span class="p">[</span><span class="n">i</span><span class="p">])</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">internal_index_column</span><span class="p">),</span> <span class="o">*</span><span class="n">col</span><span class="p">))</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">"arrays"</span><span class="p">))</span> |
| |
| <span class="c1"># And then, explode it and manually set the index.</span> |
| <span class="c1"># +-----------------+---+---+</span> |
| <span class="c1"># |__index_level_0__| a| b|</span> |
| <span class="c1"># +-----------------+---+---+</span> |
| <span class="c1"># | 0.25| 2| 6|</span> |
| <span class="c1"># | 0.5| 3| 7|</span> |
| <span class="c1"># | 0.75| 4| 8|</span> |
| <span class="c1"># +-----------------+---+---+</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">"arrays"</span><span class="p">)))</span><span class="o">.</span><span class="n">selectExpr</span><span class="p">(</span><span class="s2">"col.*"</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">internal_index_column</span><span class="p">)],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">percentile_col_names</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">quantile</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"quantile"</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">qq</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.query"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.query.html#pyspark.pandas.DataFrame.query">[docs]</a> <span class="k">def</span> <span class="nf">query</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">expr</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Query the columns of a DataFrame with a boolean expression.</span> |
| |
| <span class="sd"> .. note:: Internal columns that start with a '__' prefix are accessible; however,</span> |
| <span class="sd"> they are not supposed to be accessed.</span> |
| |
| <span class="sd"> .. note:: This API delegates to Spark SQL so the syntax follows Spark SQL. Therefore, the</span> |
| <span class="sd"> pandas specific syntax such as `@` is not supported. If you want the pandas syntax,</span> |
| <span class="sd"> you can work around with :meth:`DataFrame.pandas_on_spark.apply_batch`, but you should</span> |
| <span class="sd"> be aware that `query_func` will be executed at different nodes in a distributed manner.</span> |
| <span class="sd"> So, for example, to use `@` syntax, make sure the variable is serialized by, for</span> |
| <span class="sd"> example, putting it within the closure as below.</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'A': range(2000), 'B': range(2000)})</span> |
| <span class="sd"> >>> def query_func(pdf):</span> |
| <span class="sd"> ... num = 1995</span> |
| <span class="sd"> ... return pdf.query('A > @num')</span> |
| <span class="sd"> >>> df.pandas_on_spark.apply_batch(query_func)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1996 1996 1996</span> |
| <span class="sd"> 1997 1997 1997</span> |
| <span class="sd"> 1998 1998 1998</span> |
| <span class="sd"> 1999 1999 1999</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> expr : str</span> |
| <span class="sd"> The query string to evaluate.</span> |
| |
| <span class="sd"> You can refer to column names that contain spaces by surrounding</span> |
| <span class="sd"> them in backticks.</span> |
| |
| <span class="sd"> For example, if one of your columns is called ``a a`` and you want</span> |
| <span class="sd"> to sum it with ``b``, your query should be ```a a` + b``.</span> |
| |
| <span class="sd"> inplace : bool</span> |
| <span class="sd"> Whether the query should modify the data in place or return</span> |
| <span class="sd"> a modified copy.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame resulting from the provided query expression.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': range(1, 6),</span> |
| <span class="sd"> ... 'B': range(10, 0, -2),</span> |
| <span class="sd"> ... 'C C': range(10, 5, -1)})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C C</span> |
| <span class="sd"> 0 1 10 10</span> |
| <span class="sd"> 1 2 8 9</span> |
| <span class="sd"> 2 3 6 8</span> |
| <span class="sd"> 3 4 4 7</span> |
| <span class="sd"> 4 5 2 6</span> |
| |
| <span class="sd"> >>> df.query('A > B')</span> |
| <span class="sd"> A B C C</span> |
| <span class="sd"> 4 5 2 6</span> |
| |
| <span class="sd"> The previous expression is equivalent to</span> |
| |
| <span class="sd"> >>> df[df.A > df.B]</span> |
| <span class="sd"> A B C C</span> |
| <span class="sd"> 4 5 2 6</span> |
| |
| <span class="sd"> For columns with spaces in their name, you can use backtick quoting.</span> |
| |
| <span class="sd"> >>> df.query('B == `C C`')</span> |
| <span class="sd"> A B C C</span> |
| <span class="sd"> 0 1 10 10</span> |
| |
| <span class="sd"> The previous expression is equivalent to</span> |
| |
| <span class="sd"> >>> df[df.B == df['C C']]</span> |
| <span class="sd"> A B C C</span> |
| <span class="sd"> 0 1 10 10</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Doesn't support for MultiIndex columns"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">expr</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"expr must be a string to be evaluated, </span><span class="si">{}</span><span class="s2"> given"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">expr</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">,</span> <span class="n">data_columns</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">expr</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">data_columns</span><span class="o">=</span><span class="n">data_columns</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.take"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.take.html#pyspark.pandas.DataFrame.take">[docs]</a> <span class="k">def</span> <span class="nf">take</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">indices</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the elements in the given *positional* indices along an axis.</span> |
| |
| <span class="sd"> This means that we are not indexing according to actual values in</span> |
| <span class="sd"> the index attribute of the object. We are indexing according to the</span> |
| <span class="sd"> actual position of the element in the object.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> indices : array-like</span> |
| <span class="sd"> An array of ints indicating which positions to take.</span> |
| <span class="sd"> axis : {0 or 'index', 1 or 'columns', None}, default 0</span> |
| <span class="sd"> The axis on which to select elements. ``0`` means that we are</span> |
| <span class="sd"> selecting rows, ``1`` means that we are selecting columns.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> For compatibility with :meth:`numpy.take`. Has no effect on the</span> |
| <span class="sd"> output.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> taken : same type as caller</span> |
| <span class="sd"> An array-like containing the elements taken from the object.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.loc : Select a subset of a DataFrame by labels.</span> |
| <span class="sd"> DataFrame.iloc : Select a subset of a DataFrame by positions.</span> |
| <span class="sd"> numpy.take : Take elements from an array along an axis.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([('falcon', 'bird', 389.0),</span> |
| <span class="sd"> ... ('parrot', 'bird', 24.0),</span> |
| <span class="sd"> ... ('lion', 'mammal', 80.5),</span> |
| <span class="sd"> ... ('monkey', 'mammal', np.nan)],</span> |
| <span class="sd"> ... columns=['name', 'class', 'max_speed'],</span> |
| <span class="sd"> ... index=[0, 2, 3, 1])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 2 parrot bird 24.0</span> |
| <span class="sd"> 3 lion mammal 80.5</span> |
| <span class="sd"> 1 monkey mammal NaN</span> |
| |
| <span class="sd"> Take elements at positions 0 and 3 along the axis 0 (default).</span> |
| |
| <span class="sd"> Note how the actual indices selected (0 and 1) do not correspond to</span> |
| <span class="sd"> our selected indices 0 and 3. That's because we are selecting the 0th</span> |
| <span class="sd"> and 3rd rows, not rows whose indices equal 0 and 3.</span> |
| |
| <span class="sd"> >>> df.take([0, 3]).sort_index()</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 1 monkey mammal NaN</span> |
| |
| <span class="sd"> Take elements at indices 1 and 2 along the axis 1 (column selection).</span> |
| |
| <span class="sd"> >>> df.take([1, 2], axis=1)</span> |
| <span class="sd"> class max_speed</span> |
| <span class="sd"> 0 bird 389.0</span> |
| <span class="sd"> 2 bird 24.0</span> |
| <span class="sd"> 3 mammal 80.5</span> |
| <span class="sd"> 1 mammal NaN</span> |
| |
| <span class="sd"> We may take elements using negative integers for positive indices,</span> |
| <span class="sd"> starting from the end of the object, just like with Python lists.</span> |
| |
| <span class="sd"> >>> df.take([-1, -2]).sort_index()</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 1 monkey mammal NaN</span> |
| <span class="sd"> 3 lion mammal 80.5</span> |
| <span class="sd"> """</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">indices</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">indices</span><span class="p">,</span> <span class="p">(</span><span class="nb">dict</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"`indices` must be a list-like except dict or set"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="n">indices</span><span class="p">,</span> <span class="p">:])</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="n">indices</span><span class="p">])</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.eval"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.eval.html#pyspark.pandas.DataFrame.eval">[docs]</a> <span class="k">def</span> <span class="nf">eval</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">expr</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">DataFrameOrSeries</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Evaluate a string describing operations on DataFrame columns.</span> |
| |
| <span class="sd"> Operates on columns only, not specific rows or elements. This allows</span> |
| <span class="sd"> `eval` to run arbitrary code, which can make you vulnerable to code</span> |
| <span class="sd"> injection if you pass user input to this function.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> expr : str</span> |
| <span class="sd"> The expression string to evaluate.</span> |
| <span class="sd"> inplace : bool, default False</span> |
| <span class="sd"> If the expression contains an assignment, whether to perform the</span> |
| <span class="sd"> operation inplace and mutate the existing DataFrame. Otherwise,</span> |
| <span class="sd"> a new DataFrame is returned.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> The result of the evaluation.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.query : Evaluates a boolean expression to query the columns</span> |
| <span class="sd"> of a frame.</span> |
| <span class="sd"> DataFrame.assign : Can evaluate an expression or function to create new</span> |
| <span class="sd"> values for a column.</span> |
| <span class="sd"> eval : Evaluate a Python expression as a string using various</span> |
| <span class="sd"> backends.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 10</span> |
| <span class="sd"> 1 2 8</span> |
| <span class="sd"> 2 3 6</span> |
| <span class="sd"> 3 4 4</span> |
| <span class="sd"> 4 5 2</span> |
| <span class="sd"> >>> df.eval('A + B')</span> |
| <span class="sd"> 0 11</span> |
| <span class="sd"> 1 10</span> |
| <span class="sd"> 2 9</span> |
| <span class="sd"> 3 8</span> |
| <span class="sd"> 4 7</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> Assignment is allowed though by default the original DataFrame is not</span> |
| <span class="sd"> modified.</span> |
| |
| <span class="sd"> >>> df.eval('C = A + B')</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 10 11</span> |
| <span class="sd"> 1 2 8 10</span> |
| <span class="sd"> 2 3 6 9</span> |
| <span class="sd"> 3 4 4 8</span> |
| <span class="sd"> 4 5 2 7</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 10</span> |
| <span class="sd"> 1 2 8</span> |
| <span class="sd"> 2 3 6</span> |
| <span class="sd"> 3 4 4</span> |
| <span class="sd"> 4 5 2</span> |
| |
| <span class="sd"> Use ``inplace=True`` to modify the original DataFrame.</span> |
| |
| <span class="sd"> >>> df.eval('C = A + B', inplace=True)</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 10 11</span> |
| <span class="sd"> 1 2 8 10</span> |
| <span class="sd"> 2 3 6 9</span> |
| <span class="sd"> 3 4 4 8</span> |
| <span class="sd"> 4 5 2 7</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"`eval` is not supported for multi-index columns"</span><span class="p">)</span> |
| <span class="n">inplace</span> <span class="o">=</span> <span class="n">validate_bool_kwarg</span><span class="p">(</span><span class="n">inplace</span><span class="p">,</span> <span class="s2">"inplace"</span><span class="p">)</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="n">series_name</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">should_return_scalar</span> <span class="o">=</span> <span class="kc">False</span> |
| |
| <span class="c1"># Since `eval_func` doesn't have a type hint, inferring the schema is always performed</span> |
| <span class="c1"># in the `apply_batch`. Hence, the variables `should_return_series`, `series_name`,</span> |
| <span class="c1"># and `should_return_scalar` can be updated.</span> |
| <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">eval_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">):</span> |
| <span class="k">nonlocal</span> <span class="n">should_return_series</span> |
| <span class="k">nonlocal</span> <span class="n">series_name</span> |
| <span class="k">nonlocal</span> <span class="n">should_return_scalar</span> |
| <span class="n">result_inner</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">expr</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="n">inplace</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="n">result_inner</span> <span class="o">=</span> <span class="n">pdf</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">result_inner</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">series_name</span> <span class="o">=</span> <span class="n">result_inner</span><span class="o">.</span><span class="n">name</span> |
| <span class="n">result_inner</span> <span class="o">=</span> <span class="n">result_inner</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="k">elif</span> <span class="n">is_scalar</span><span class="p">(</span><span class="n">result_inner</span><span class="p">):</span> |
| <span class="n">should_return_scalar</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">result_inner</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">result_inner</span><span class="p">)</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">result_inner</span> |
| |
| <span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">apply_batch</span><span class="p">(</span><span class="n">eval_func</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">inplace</span><span class="p">:</span> |
| <span class="c1"># Here, the result is always a frame because the error is thrown during schema inference</span> |
| <span class="c1"># from pandas.</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">_internal</span><span class="p">,</span> <span class="n">requires_same_anchor</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">elif</span> <span class="n">should_return_series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">result</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">series_name</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">should_return_scalar</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">result</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Returns a frame</span> |
| <span class="k">return</span> <span class="n">result</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.explode"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.explode.html#pyspark.pandas.DataFrame.explode">[docs]</a> <span class="k">def</span> <span class="nf">explode</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">column</span><span class="p">:</span> <span class="n">Name</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Transform each element of a list-like to a row, replicating index values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> column : str or tuple</span> |
| <span class="sd"> Column to explode.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Exploded lists to rows of the subset columns;</span> |
| <span class="sd"> index will be duplicated for these rows.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.unstack : Pivot a level of the (necessarily hierarchical)</span> |
| <span class="sd"> index labels.</span> |
| <span class="sd"> DataFrame.melt : Unpivot a DataFrame from wide format to long format.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [[1, 2, 3], [], [3, 4]], 'B': 1})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 [1, 2, 3] 1</span> |
| <span class="sd"> 1 [] 1</span> |
| <span class="sd"> 2 [3, 4] 1</span> |
| |
| <span class="sd"> >>> df.explode('A')</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1.0 1</span> |
| <span class="sd"> 0 2.0 1</span> |
| <span class="sd"> 0 3.0 1</span> |
| <span class="sd"> 1 NaN 1</span> |
| <span class="sd"> 2 3.0 1</span> |
| <span class="sd"> 2 4.0 1</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">column</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"column must be a scalar"</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="p">)</span> <span class="c1"># type: "DataFrame"</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">column</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"The column </span><span class="si">%s</span><span class="s2"> is not unique. For a multi-index, the label must be a tuple "</span> |
| <span class="s2">"with elements corresponding to each level."</span> <span class="o">%</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">column</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">ArrayType</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">F</span><span class="o">.</span><span class="n">explode_outer</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">idx</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="o">.</span><span class="n">index</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| <span class="n">field</span> <span class="o">=</span> <span class="n">data_fields</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">ArrayType</span><span class="p">,</span> <span class="n">field</span><span class="o">.</span><span class="n">spark_type</span><span class="p">)</span><span class="o">.</span><span class="n">elementType</span> |
| <span class="n">dtype</span> <span class="o">=</span> <span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">)</span> |
| <span class="n">data_fields</span><span class="p">[</span><span class="n">idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">spark_type</span><span class="o">=</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.mad"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.mad.html#pyspark.pandas.DataFrame.mad">[docs]</a> <span class="k">def</span> <span class="nf">mad</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Series"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the mean absolute deviation of values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {index (0), columns (1)}</span> |
| <span class="sd"> Axis for the function to be applied on.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]},</span> |
| <span class="sd"> ... columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.mad()</span> |
| <span class="sd"> a 0.666667</span> |
| <span class="sd"> b 0.066667</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> df.mad(axis=1)</span> |
| <span class="sd"> 0 0.45</span> |
| <span class="sd"> 1 0.90</span> |
| <span class="sd"> 2 1.35</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span> |
| |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| |
| <span class="k">def</span> <span class="nf">get_spark_column</span><span class="p">(</span><span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">label</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">col_type</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"integer"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">scol</span> |
| |
| <span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">[]</span> <span class="c1"># type: List[Label]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="c1"># Keep only the columns whose Spark data type is numeric or boolean.</span> |
| <span class="n">dtype</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">dtype</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)):</span> |
| <span class="n">new_column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| |
| <span class="n">new_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">avg</span><span class="p">(</span><span class="n">get_spark_column</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">label</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">new_column_labels</span> |
| <span class="p">]</span> |
| |
| <span class="n">mean_data</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">new_columns</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">()</span> |
| |
| <span class="n">new_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">avg</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">get_spark_column</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="o">-</span> <span class="n">mean_data</span><span class="p">[</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)])</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">new_column_labels</span> |
| <span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">StringType</span><span class="p">())</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">)],</span> <span class="o">*</span><span class="n">new_columns</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># The data is expected to be small so it's fine to transpose/use default index.</span> |
| <span class="k">with</span> <span class="n">ps</span><span class="o">.</span><span class="n">option_context</span><span class="p">(</span><span class="s2">"compute.max_rows"</span><span class="p">,</span> <span class="mi">1</span><span class="p">):</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">)],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">new_column_labels</span><span class="p">,</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">transpose</span><span class="p">())</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="n">returnType</span><span class="o">=</span><span class="n">DoubleType</span><span class="p">())</span> <span class="c1"># type: ignore</span> |
| <span class="k">def</span> <span class="nf">calculate_columns_axis</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">(</span><span class="n">cols</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">mad</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">calculate_columns_axis</span><span class="p">(</span><span class="o">*</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span> |
| <span class="n">SPARK_DEFAULT_SERIES_NAME</span> |
| <span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.tail"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.tail.html#pyspark.pandas.DataFrame.tail">[docs]</a> <span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the last `n` rows.</span> |
| |
| <span class="sd"> This function returns last `n` rows from the object based on</span> |
| <span class="sd"> position. It is useful for quickly verifying data, for example,</span> |
| <span class="sd"> after sorting or appending rows.</span> |
| |
| <span class="sd"> For negative values of `n`, this function returns all rows except</span> |
| <span class="sd"> the first ``abs(n)`` rows, equivalent to ``df[abs(n):]``.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, default 5</span> |
| <span class="sd"> Number of rows to select.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> type of caller</span> |
| <span class="sd"> The last `n` rows of the caller object.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.head : The first `n` rows of the caller object.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',</span> |
| <span class="sd"> ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> 0 alligator</span> |
| <span class="sd"> 1 bee</span> |
| <span class="sd"> 2 falcon</span> |
| <span class="sd"> 3 lion</span> |
| <span class="sd"> 4 monkey</span> |
| <span class="sd"> 5 parrot</span> |
| <span class="sd"> 6 shark</span> |
| <span class="sd"> 7 whale</span> |
| <span class="sd"> 8 zebra</span> |
| |
| <span class="sd"> Viewing the last 5 lines</span> |
| |
| <span class="sd"> >>> df.tail() # doctest: +SKIP</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> 4 monkey</span> |
| <span class="sd"> 5 parrot</span> |
| <span class="sd"> 6 shark</span> |
| <span class="sd"> 7 whale</span> |
| <span class="sd"> 8 zebra</span> |
| |
| <span class="sd"> Viewing the last `n` lines (three in this case)</span> |
| |
| <span class="sd"> >>> df.tail(3) # doctest: +SKIP</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> 6 shark</span> |
| <span class="sd"> 7 whale</span> |
| <span class="sd"> 8 zebra</span> |
| |
| <span class="sd"> For negative values of `n`</span> |
| |
| <span class="sd"> >>> df.tail(-3) # doctest: +SKIP</span> |
| <span class="sd"> animal</span> |
| <span class="sd"> 3 lion</span> |
| <span class="sd"> 4 monkey</span> |
| <span class="sd"> 5 parrot</span> |
| <span class="sd"> 6 shark</span> |
| <span class="sd"> 7 whale</span> |
| <span class="sd"> 8 zebra</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"bad operand type for unary -: '</span><span class="si">{}</span><span class="s2">'"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">n</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">n</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">+</span> <span class="n">n</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="o">&lt;=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)))</span> |
| <span class="c1"># Should use `resolved_copy` here for the case like `(psdf + 1).tail()`</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> |
| <span class="n">new_sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="n">sdf</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">new_sdf</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.align"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.align.html#pyspark.pandas.DataFrame.align">[docs]</a> <span class="k">def</span> <span class="nf">align</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">other</span><span class="p">:</span> <span class="n">DataFrameOrSeries</span><span class="p">,</span> |
| <span class="n">join</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"outer"</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">copy</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">,</span> <span class="n">DataFrameOrSeries</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Align two objects on their axes with the specified join method.</span> |
| |
| <span class="sd"> Join method is specified for each axis Index.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : DataFrame or Series</span> |
| <span class="sd"> join : {'outer', 'inner', 'left', 'right'}, default 'outer'</span> |
| <span class="sd"> axis : allowed axis of the other object, default None</span> |
| <span class="sd"> Align on index (0), columns (1), or both (None).</span> |
| <span class="sd"> copy : bool, default True</span> |
| <span class="sd"> Always returns new objects. If copy=False and no reindexing is</span> |
| <span class="sd"> required then original objects are returned.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> (left, right) : (DataFrame, type of other)</span> |
| <span class="sd"> Aligned objects.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> ps.set_option("compute.ops_on_diff_frames", True)</span> |
| <span class="sd"> >>> df1 = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=[10, 20, 30])</span> |
| <span class="sd"> >>> df2 = ps.DataFrame({"a": [4, 5, 6], "c": ["d", "e", "f"]}, index=[10, 11, 12])</span> |
| |
| <span class="sd"> Align both axes:</span> |
| |
| <span class="sd"> >>> aligned_l, aligned_r = df1.align(df2)</span> |
| <span class="sd"> >>> aligned_l.sort_index()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 10 1.0 a NaN</span> |
| <span class="sd"> 11 NaN None NaN</span> |
| <span class="sd"> 12 NaN None NaN</span> |
| <span class="sd"> 20 2.0 b NaN</span> |
| <span class="sd"> 30 3.0 c NaN</span> |
| <span class="sd"> >>> aligned_r.sort_index()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 10 4.0 NaN d</span> |
| <span class="sd"> 11 5.0 NaN e</span> |
| <span class="sd"> 12 6.0 NaN f</span> |
| <span class="sd"> 20 NaN NaN None</span> |
| <span class="sd"> 30 NaN NaN None</span> |
| |
| <span class="sd"> Align only axis=0 (index):</span> |
| |
| <span class="sd"> >>> aligned_l, aligned_r = df1.align(df2, axis=0)</span> |
| <span class="sd"> >>> aligned_l.sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 10 1.0 a</span> |
| <span class="sd"> 11 NaN None</span> |
| <span class="sd"> 12 NaN None</span> |
| <span class="sd"> 20 2.0 b</span> |
| <span class="sd"> 30 3.0 c</span> |
| <span class="sd"> >>> aligned_r.sort_index()</span> |
| <span class="sd"> a c</span> |
| <span class="sd"> 10 4.0 d</span> |
| <span class="sd"> 11 5.0 e</span> |
| <span class="sd"> 12 6.0 f</span> |
| <span class="sd"> 20 NaN None</span> |
| <span class="sd"> 30 NaN None</span> |
| |
| <span class="sd"> Align only axis=1 (column):</span> |
| |
| <span class="sd"> >>> aligned_l, aligned_r = df1.align(df2, axis=1)</span> |
| <span class="sd"> >>> aligned_l.sort_index()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 10 1 a NaN</span> |
| <span class="sd"> 20 2 b NaN</span> |
| <span class="sd"> 30 3 c NaN</span> |
| <span class="sd"> >>> aligned_r.sort_index()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 10 4 NaN d</span> |
| <span class="sd"> 11 5 NaN e</span> |
| <span class="sd"> 12 6 NaN f</span> |
| |
| <span class="sd"> Align with the join type "inner":</span> |
| |
| <span class="sd"> >>> aligned_l, aligned_r = df1.align(df2, join="inner")</span> |
| <span class="sd"> >>> aligned_l.sort_index()</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 10 1</span> |
| <span class="sd"> >>> aligned_r.sort_index()</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 10 4</span> |
| |
| <span class="sd"> Align with a Series:</span> |
| |
| <span class="sd"> >>> s = ps.Series([7, 8, 9], index=[10, 11, 12])</span> |
| <span class="sd"> >>> aligned_l, aligned_r = df1.align(s, axis=0)</span> |
| <span class="sd"> >>> aligned_l.sort_index()</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 10 1.0 a</span> |
| <span class="sd"> 11 NaN None</span> |
| <span class="sd"> 12 NaN None</span> |
| <span class="sd"> 20 2.0 b</span> |
| <span class="sd"> 30 3.0 c</span> |
| <span class="sd"> >>> aligned_r.sort_index()</span> |
| <span class="sd"> 10 7.0</span> |
| <span class="sd"> 11 8.0</span> |
| <span class="sd"> 12 9.0</span> |
| <span class="sd"> 20 NaN</span> |
| <span class="sd"> 30 NaN</span> |
| <span class="sd"> dtype: float64</span> |
| |
| <span class="sd"> >>> ps.reset_option("compute.ops_on_diff_frames")</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"unsupported type: </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| |
| <span class="n">how</span> <span class="o">=</span> <span class="n">validate_how</span><span class="p">(</span><span class="n">join</span><span class="p">)</span> |
| <span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| |
| <span class="n">right_is_series</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">right_is_series</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Must specify axis=0 or 1"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"align currently only works for axis=0 when right is Series"</span> |
| <span class="p">)</span> |
| |
| <span class="n">left</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="n">right</span> <span class="o">=</span> <span class="n">other</span> |
| |
| <span class="k">if</span> <span class="p">(</span><span class="n">axis</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">):</span> |
| <span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="n">how</span><span class="p">)</span> |
| <span class="n">left</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">"this"</span><span class="p">]</span> |
| <span class="n">right</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">"that"</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">right_is_series</span><span class="p">:</span> |
| <span class="n">right</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="n">right</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="n">axis</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span> |
| <span class="p">)</span> <span class="ow">and</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="o">!=</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| |
| <span class="k">if</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">!=</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"cannot join with no overlapping index names"</span><span class="p">)</span> |
| |
| <span class="n">left</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">right</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"full"</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span> |
| <span class="nb">list</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">|</span> <span class="nb">set</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"inner"</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span> |
| <span class="nb">list</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&amp;</span> <span class="nb">set</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="n">how</span> <span class="o">==</span> <span class="s2">"left"</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">left</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="o">=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">())</span> |
| <span class="n">left</span> <span class="o">=</span> <span class="n">left</span><span class="p">[</span><span class="n">column_labels</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">right</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="o">=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">())</span> |
| <span class="n">right</span> <span class="o">=</span> <span class="n">right</span><span class="p">[</span><span class="n">column_labels</span><span class="p">]</span> |
| |
| <span class="k">return</span> <span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">copy</span><span class="p">(),</span> <span class="n">right</span><span class="o">.</span><span class="n">copy</span><span class="p">())</span> <span class="k">if</span> <span class="n">copy</span> <span class="k">else</span> <span class="p">(</span><span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">from_dict</span><span class="p">(</span> |
| <span class="n">data</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">]],</span> |
| <span class="n">orient</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"columns"</span><span class="p">,</span> |
| <span class="n">dtype</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Construct DataFrame from dict of array-like or dicts.</span> |
| |
| <span class="sd"> Creates DataFrame object from dictionary by columns or by index</span> |
| <span class="sd"> allowing dtype specification.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> data : dict</span> |
| <span class="sd"> Of the form {field : array-like} or {field : dict}.</span> |
| <span class="sd"> orient : {'columns', 'index'}, default 'columns'</span> |
| <span class="sd"> The "orientation" of the data. If the keys of the passed dict</span> |
| <span class="sd"> should be the columns of the resulting DataFrame, pass 'columns'</span> |
| <span class="sd"> (default). Otherwise if the keys should be rows, pass 'index'.</span> |
| <span class="sd"> dtype : dtype, default None</span> |
| <span class="sd"> Data type to force, otherwise infer.</span> |
| <span class="sd"> columns : list, default None</span> |
| <span class="sd"> Column labels to use when ``orient='index'``. Raises a ValueError</span> |
| <span class="sd"> if used with ``orient='columns'``.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.from_records : DataFrame from structured ndarray, sequence</span> |
| <span class="sd"> of tuples or dicts, or DataFrame.</span> |
| <span class="sd"> DataFrame : DataFrame object creation using constructor.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> By default the keys of the dict become the DataFrame columns:</span> |
| |
| <span class="sd"> >>> data = {'col_1': [3, 2, 1, 0], 'col_2': [10, 20, 30, 40]}</span> |
| <span class="sd"> >>> ps.DataFrame.from_dict(data)</span> |
| <span class="sd"> col_1 col_2</span> |
| <span class="sd"> 0 3 10</span> |
| <span class="sd"> 1 2 20</span> |
| <span class="sd"> 2 1 30</span> |
| <span class="sd"> 3 0 40</span> |
| |
| <span class="sd"> Specify ``orient='index'`` to create the DataFrame using dictionary</span> |
| <span class="sd"> keys as rows:</span> |
| |
| <span class="sd"> >>> data = {'row_1': [3, 2, 1, 0], 'row_2': [10, 20, 30, 40]}</span> |
| <span class="sd"> >>> ps.DataFrame.from_dict(data, orient='index').sort_index()</span> |
| <span class="sd"> 0 1 2 3</span> |
| <span class="sd"> row_1 3 2 1 0</span> |
| <span class="sd"> row_2 10 20 30 40</span> |
| |
| <span class="sd"> When using the 'index' orientation, the column names can be</span> |
| <span class="sd"> specified manually:</span> |
| |
| <span class="sd"> >>> ps.DataFrame.from_dict(data, orient='index',</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D']).sort_index()</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> row_1 3 2 1 0</span> |
| <span class="sd"> row_2 10 20 30 40</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">from_dict</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">orient</span><span class="o">=</span><span class="n">orient</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">))</span> |
| |
| <span class="c1"># Override the `groupby` to specify the actual return type annotation.</span> |
| <div class="viewcode-block" id="DataFrame.groupby"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.DataFrame.groupby.html#pyspark.pandas.DataFrame.groupby">[docs]</a> <span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">by</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">"Series"</span><span class="p">]]],</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrameGroupBy"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span> |
| <span class="s2">"DataFrameGroupBy"</span><span class="p">,</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">by</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| <span class="n">groupby</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">Frame</span><span class="o">.</span><span class="n">groupby</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="k">def</span> <span class="nf">_build_groupby</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrameGroupBy"</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">DataFrameGroupBy</span> |
| |
| <span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="o">.</span><span class="n">_build</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">by</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_to_internal_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a pandas DataFrame directly from _internal to avoid the overhead of a copy.</span> |
| |
| <span class="sd"> This method is for internal use only.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">to_pandas_frame</span> |
| |
| <span class="k">def</span> <span class="nf">_get_or_create_repr_pandas_cache</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_repr_pandas_cache"</span><span class="p">)</span> <span class="ow">or</span> <span class="n">n</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_repr_pandas_cache</span><span class="p">:</span> |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="s2">"_repr_pandas_cache"</span><span class="p">,</span> <span class="p">{</span><span class="n">n</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">n</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()}</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_repr_pandas_cache</span><span class="p">[</span><span class="n">n</span><span class="p">]</span> |
| |
| <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="n">max_display_count</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"display.max_rows"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">max_display_count</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">to_string</span><span class="p">()</span> |
| |
| <span class="n">pdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_or_create_repr_pandas_cache</span><span class="p">(</span><span class="n">max_display_count</span><span class="p">)</span> |
| <span class="n">pdf_length</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:</span><span class="n">max_display_count</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">pdf_length</span> <span class="o">></span> <span class="n">max_display_count</span><span class="p">:</span> |
| <span class="n">repr_string</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_string</span><span class="p">(</span><span class="n">show_dimensions</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">match</span> <span class="o">=</span> <span class="n">REPR_PATTERN</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">repr_string</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">nrows</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="s2">"rows"</span><span class="p">)</span> |
| <span class="n">ncols</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="s2">"columns"</span><span class="p">)</span> |
| <span class="n">footer</span> <span class="o">=</span> <span class="s2">"</span><span class="se">\n\n</span><span class="s2">[Showing only the first </span><span class="si">{nrows}</span><span class="s2"> rows x </span><span class="si">{ncols}</span><span class="s2"> columns]"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">nrows</span><span class="o">=</span><span class="n">nrows</span><span class="p">,</span> <span class="n">ncols</span><span class="o">=</span><span class="n">ncols</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">REPR_PATTERN</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="n">footer</span><span class="p">,</span> <span class="n">repr_string</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_string</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">_repr_html_</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="n">max_display_count</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"display.max_rows"</span><span class="p">)</span> |
| <span class="c1"># pandas 0.25.1 has a regression in its HTML representation, so 'bold_rows'</span> |
| <span class="c1"># has to be set to False explicitly. See https://github.com/pandas-dev/pandas/issues/28204</span> |
| <span class="n">bold_rows</span> <span class="o">=</span> <span class="ow">not</span> <span class="p">(</span><span class="n">LooseVersion</span><span class="p">(</span><span class="s2">"0.25.1"</span><span class="p">)</span> <span class="o">==</span> <span class="n">LooseVersion</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">__version__</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">max_display_count</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">to_html</span><span class="p">(</span><span class="n">notebook</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">bold_rows</span><span class="o">=</span><span class="n">bold_rows</span><span class="p">)</span> |
| |
| <span class="n">pdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_or_create_repr_pandas_cache</span><span class="p">(</span><span class="n">max_display_count</span><span class="p">)</span> |
| <span class="n">pdf_length</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:</span><span class="n">max_display_count</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">pdf_length</span> <span class="o">></span> <span class="n">max_display_count</span><span class="p">:</span> |
| <span class="n">repr_html</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_html</span><span class="p">(</span><span class="n">show_dimensions</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">notebook</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">bold_rows</span><span class="o">=</span><span class="n">bold_rows</span><span class="p">)</span> |
| <span class="n">match</span> <span class="o">=</span> <span class="n">REPR_HTML_PATTERN</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">repr_html</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">nrows</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="s2">"rows"</span><span class="p">)</span> |
| <span class="n">ncols</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="s2">"columns"</span><span class="p">)</span> |
| <span class="n">by</span> <span class="o">=</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">215</span><span class="p">)</span> |
| <span class="n">footer</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="s2">"</span><span class="se">\n</span><span class="s2">&lt;p&gt;Showing only the first </span><span class="si">{rows}</span><span class="s2"> rows "</span> |
| <span class="s2">"</span><span class="si">{by}</span><span class="s2"> </span><span class="si">{cols}</span><span class="s2"> columns&lt;/p&gt;</span><span class="se">\n</span><span class="s2">&lt;/div&gt;"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">rows</span><span class="o">=</span><span class="n">nrows</span><span class="p">,</span> <span class="n">by</span><span class="o">=</span><span class="n">by</span><span class="p">,</span> <span class="n">cols</span><span class="o">=</span><span class="n">ncols</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">REPR_HTML_PATTERN</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="n">footer</span><span class="p">,</span> <span class="n">repr_html</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_html</span><span class="p">(</span><span class="n">notebook</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">bold_rows</span><span class="o">=</span><span class="n">bold_rows</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| |
| <span class="k">if</span> <span class="n">key</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">"none key"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">key</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">bool</span><span class="p">)]</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="nb">slice</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">n</span><span class="p">)</span> <span class="o">==</span> <span class="nb">int</span> <span class="ow">or</span> <span class="kc">None</span> <span class="k">for</span> <span class="n">n</span> <span class="ow">in</span> <span class="p">[</span><span class="n">key</span><span class="o">.</span><span class="n">start</span><span class="p">,</span> <span class="n">key</span><span class="o">.</span><span class="n">stop</span><span class="p">]):</span> |
| <span class="c1"># It seems that a pandas DataFrame always treats ints as positions when slicing</span> |
| <span class="c1"># with ints.</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="n">key</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">key</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="nb">list</span><span class="p">(</span><span class="n">key</span><span class="p">)]</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__setitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">))</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="bp">self</span><span class="p">):</span> |
| <span class="c1"># Different Series or DataFrames</span> |
| <span class="n">level</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> |
| <span class="n">key</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">_index_normalized_label</span><span class="p">(</span><span class="n">level</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">_index_normalized_frame</span><span class="p">(</span><span class="n">level</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">assign_columns</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">that_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]:</span> |
| <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">that_column_labels</span><span class="p">)</span> |
| <span class="c1"># Note that this intentionally uses `zip_longest` to combine</span> |
| <span class="c1"># that_columns.</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">this_label</span><span class="p">,</span> <span class="n">that_label</span> <span class="ow">in</span> <span class="n">zip_longest</span><span class="p">(</span> |
| <span class="n">key</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">,</span> <span class="n">that_column_labels</span> |
| <span class="p">):</span> |
| <span class="k">yield</span> <span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">that_label</span><span class="p">),</span> <span class="nb">tuple</span><span class="p">([</span><span class="s2">"that"</span><span class="p">,</span> <span class="o">*</span><span class="n">k</span><span class="p">]))</span> |
| <span class="k">if</span> <span class="n">this_label</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">this_label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span> <span class="o">!=</span> <span class="n">k</span><span class="p">:</span> |
| <span class="k">yield</span> <span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">this_label</span><span class="p">),</span> <span class="n">this_label</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span><span class="n">assign_columns</span><span class="p">,</span> <span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">,</span> <span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">"left"</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">value</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"Length of values does not match length of index"</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: avoid using default index?</span> |
| <span class="k">with</span> <span class="n">option_context</span><span class="p">(</span> |
| <span class="s2">"compute.default_index_type"</span><span class="p">,</span> |
| <span class="s2">"distributed-sequence"</span><span class="p">,</span> |
| <span class="s2">"compute.ops_on_diff_frames"</span><span class="p">,</span> |
| <span class="kc">True</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">])</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> |
| |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)</span> |
| <span class="c1"># Same DataFrames.</span> |
| <span class="n">field_names</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">columns</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_assign</span><span class="p">({</span><span class="n">k</span><span class="p">:</span> <span class="n">value</span><span class="p">[</span><span class="n">c</span><span class="p">]</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">field_names</span><span class="p">)})</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Same Series.</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_assign</span><span class="p">({</span><span class="n">key</span><span class="p">:</span> <span class="n">value</span><span class="p">})</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_update_internal_frame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="p">)</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_index_normalized_label</span><span class="p">(</span><span class="n">level</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">labels</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Name</span><span class="p">]])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a label that is normalized against the current column index level.</span> |
| <span class="sd"> For example, the key "abc" can be ("abc", "", "") if the current Frame has</span> |
| <span class="sd"> a multi-index for its column</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">labels</span><span class="p">):</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">labels</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">labels</span><span class="p">):</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="p">[(</span><span class="n">labels</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">k</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">k</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">k</span><span class="p">,)</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">labels</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">></span> <span class="n">level</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span> |
| <span class="s2">"Key length (</span><span class="si">{}</span><span class="s2">) exceeds index depth (</span><span class="si">{}</span><span class="s2">)"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="nb">max</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span><span class="p">),</span> <span class="n">level</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">([</span><span class="s2">""</span><span class="p">]</span> <span class="o">*</span> <span class="p">(</span><span class="n">level</span> <span class="o">-</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">))))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span><span class="p">]</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_index_normalized_frame</span><span class="p">(</span><span class="n">level</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">psser_or_psdf</span><span class="p">:</span> <span class="n">DataFrameOrSeries</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a frame that is normalized against the current column index level.</span> |
| <span class="sd"> For example, the name in `pd.Series([...], name="abc")` can be</span> |
| <span class="sd"> ("abc", "", "") if the current DataFrame has a multi-index for its column</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">),</span> <span class="nb">type</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">MultiIndex</span><span class="o">.</span><span class="n">from_tuples</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="nb">tuple</span><span class="p">([</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)]</span> <span class="o">+</span> <span class="p">([</span><span class="s2">""</span><span class="p">]</span> <span class="o">*</span> <span class="p">(</span><span class="n">level</span> <span class="o">-</span> <span class="mi">1</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span> |
| |
| <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">key</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"__"</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">_MissingPandasLikeDataFrame</span><span class="p">,</span> <span class="n">key</span><span class="p">):</span> |
| <span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">_MissingPandasLikeDataFrame</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="c1"># type: ignore</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="n">key</span><span class="p">]</span> |
| <span class="k">except</span> <span class="ne">KeyError</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span> |
| <span class="s2">"'</span><span class="si">%s</span><span class="s2">' object has no attribute '</span><span class="si">%s</span><span class="s2">'"</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__setattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__getattribute__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> |
| <span class="k">return</span> <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> |
| <span class="k">except</span> <span class="ne">AttributeError</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="k">if</span> <span class="p">(</span><span class="n">key</span><span class="p">,)</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="bp">self</span><span class="p">[</span><span class="n">key</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">msg</span> <span class="o">=</span> <span class="s2">"pandas-on-Spark doesn't allow columns to be created via a new attribute name"</span> |
| <span class="k">if</span> <span class="n">is_testing</span><span class="p">():</span> |
| <span class="k">raise</span> <span class="ne">AssertionError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="n">msg</span><span class="p">,</span> <span class="ne">UserWarning</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">count</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="fm">__dir__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="n">fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">f</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">schema</span><span class="o">.</span><span class="n">fieldNames</span><span class="p">()</span> <span class="k">if</span> <span class="s2">" "</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">f</span> |
| <span class="p">]</span> |
| <span class="k">return</span> <span class="nb">list</span><span class="p">(</span><span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__dir__</span><span class="p">())</span> <span class="o">+</span> <span class="n">fields</span> |
| |
| <span class="k">def</span> <span class="fm">__iter__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Name</span><span class="p">]:</span> |
| <span class="k">return</span> <span class="nb">iter</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> |
| |
| <span class="c1"># NDArray Compat</span> |
| <span class="k">def</span> <span class="nf">__array_ufunc__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">ufunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">inputs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="c1"># TODO: is it possible to deduplicate it with '_map_series_op'?</span> |
| <span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">inp</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="k">for</span> <span class="n">inp</span> <span class="ow">in</span> <span class="n">inputs</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">inp</span><span class="p">,</span> <span class="n">inputs</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="k">for</span> <span class="n">inp</span> <span class="ow">in</span> <span class="n">inputs</span> |
| <span class="p">):</span> |
| <span class="c1"># binary only</span> |
| <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">inputs</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span> |
| <span class="n">this</span> <span class="o">=</span> <span class="n">inputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">that</span> <span class="o">=</span> <span class="n">inputs</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">this</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">!=</span> <span class="n">that</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"cannot join with no overlapping index names"</span><span class="p">)</span> |
| |
| <span class="c1"># Different DataFrames</span> |
| <span class="k">def</span> <span class="nf">apply_op</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">that_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="s2">"Series"</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]:</span> |
| <span class="k">for</span> <span class="n">this_label</span><span class="p">,</span> <span class="n">that_label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">this_column_labels</span><span class="p">,</span> <span class="n">that_column_labels</span><span class="p">):</span> |
| <span class="k">yield</span> <span class="p">(</span> |
| <span class="n">ufunc</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">this_label</span><span class="p">),</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">that_label</span><span class="p">),</span> <span class="o">**</span><span class="n">kwargs</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">this_label</span><span class="p">),</span> |
| <span class="n">this_label</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">align_diff_frames</span><span class="p">(</span><span class="n">apply_op</span><span class="p">,</span> <span class="n">this</span><span class="p">,</span> <span class="n">that</span><span class="p">,</span> <span class="n">fillna</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">"full"</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># DataFrame and Series</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">this</span> <span class="o">=</span> <span class="n">inputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="k">assert</span> <span class="nb">all</span><span class="p">(</span><span class="n">inp</span> <span class="ow">is</span> <span class="n">this</span> <span class="k">for</span> <span class="n">inp</span> <span class="ow">in</span> <span class="n">inputs</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">inp</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">))</span> |
| |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">this</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">arguments</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">inp</span> <span class="ow">in</span> <span class="n">inputs</span><span class="p">:</span> |
| <span class="n">arguments</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">inp</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">inp</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="k">else</span> <span class="n">inp</span><span class="p">)</span> |
| <span class="c1"># both binary and unary.</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">ufunc</span><span class="p">(</span><span class="o">*</span><span class="n">arguments</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">this</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">sys</span><span class="o">.</span><span class="n">version_info</span> <span class="o">>=</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">7</span><span class="p">):</span> |
| |
| <span class="k">def</span> <span class="nf">__class_getitem__</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">params</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="nb">object</span><span class="p">:</span> |
| <span class="c1"># This is a workaround to support variadic generic in DataFrame in Python 3.7.</span> |
| <span class="c1"># See https://github.com/python/typing/issues/193</span> |
| <span class="c1"># we always wrap the given type hints in a tuple to mimic the variadic generic.</span> |
| <span class="k">return</span> <span class="n">_create_tuple_for_frame_type</span><span class="p">(</span><span class="n">params</span><span class="p">)</span> |
| |
| <span class="k">elif</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">5</span><span class="p">)</span> <span class="o"><=</span> <span class="n">sys</span><span class="o">.</span><span class="n">version_info</span> <span class="o"><</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mi">7</span><span class="p">):</span> |
| <span class="c1"># This is a workaround to support variadic generic in DataFrame in Python 3.5+</span> |
| <span class="c1"># The implementation is in its metaclass so this flag is needed to distinguish</span> |
| <span class="c1"># pandas-on-Spark DataFrame.</span> |
| <span class="n">is_dataframe</span> <span class="o">=</span> <span class="kc">None</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_reduce_spark_multi</span><span class="p">(</span><span class="n">sdf</span><span class="p">:</span> <span class="n">SparkDataFrame</span><span class="p">,</span> <span class="n">aggs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Performs a reduction on a spark DataFrame, the functions being known sql aggregate functions.</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SparkDataFrame</span><span class="p">)</span> |
| <span class="n">sdf0</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">aggs</span><span class="p">)</span> |
| <span class="n">l</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">sdf0</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">toPandas</span><span class="p">())</span> |
| <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">l</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">l</span><span class="p">)</span> |
| <span class="n">row</span> <span class="o">=</span> <span class="n">l</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">l2</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">row</span><span class="p">)</span> |
| <span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">l2</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">aggs</span><span class="p">),</span> <span class="p">(</span><span class="n">row</span><span class="p">,</span> <span class="n">l2</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">l2</span> |
| |
| |
| <span class="k">class</span> <span class="nc">CachedDataFrame</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Cached pandas-on-Spark DataFrame, which corresponds to pandas DataFrame logically, but</span> |
| <span class="sd"> internally it caches the corresponding Spark DataFrame.</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">internal</span><span class="p">:</span> <span class="n">InternalFrame</span><span class="p">,</span> <span class="n">storage_level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">StorageLevel</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">storage_level</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_cached"</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">cache</span><span class="p">())</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">storage_level</span><span class="p">,</span> <span class="n">StorageLevel</span><span class="p">):</span> |
| <span class="nb">object</span><span class="o">.</span><span class="fm">__setattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_cached"</span><span class="p">,</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">persist</span><span class="p">(</span><span class="n">storage_level</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Only a valid pyspark.StorageLevel type is acceptable for the `storage_level`"</span> |
| <span class="p">)</span> |
| <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"CachedDataFrame"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span> |
| |
| <span class="k">def</span> <span class="fm">__exit__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">exception_type</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="ne">BaseException</span><span class="p">]],</span> |
| <span class="n">exception_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="ne">BaseException</span><span class="p">],</span> |
| <span class="n">traceback</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">TracebackType</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">unpersist</span><span class="p">()</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| |
| <span class="c1"># create accessor for Spark related methods.</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">CachedAccessor</span><span class="p">(</span><span class="s2">"spark"</span><span class="p">,</span> <span class="n">CachedSparkFrameMethods</span><span class="p">)</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">shutil</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">tempfile</span> |
| <span class="kn">import</span> <span class="nn">uuid</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">import</span> <span class="nn">pyspark.pandas.frame</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_HOME"</span><span class="p">])</span> |
| |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"ps"</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"pyspark.pandas.frame tests"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="p">)</span> |
| |
| <span class="n">db_name</span> <span class="o">=</span> <span class="s2">"db</span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">str</span><span class="p">(</span><span class="n">uuid</span><span class="o">.</span><span class="n">uuid4</span><span class="p">())</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">"-"</span><span class="p">,</span> <span class="s2">""</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"CREATE DATABASE </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">db_name</span><span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"db"</span><span class="p">]</span> <span class="o">=</span> <span class="n">db_name</span> |
| |
| <span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"path"</span><span class="p">]</span> <span class="o">=</span> <span class="n">path</span> |
| |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">frame</span><span class="p">,</span> |
| <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">shutil</span><span class="o">.</span><span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">ignore_errors</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">"DROP DATABASE IF EXISTS </span><span class="si">%s</span><span class="s2"> CASCADE"</span> <span class="o">%</span> <span class="n">db_name</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| |
| <!-- Bottom prev/next container (per its class name); no links are rendered inside it on this page. --> |
| <div class="prev-next-bottom"> |
| |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <!-- Theme script bundle; the same URL is declared with <link rel="preload" as="script"> in the document head. --> |
| <script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <!-- Page footer: copyright line followed by the Sphinx attribution. --> |
| <!-- NOTE(review): the copyright holder below is blank; presumably the Sphinx `copyright` setting is unset. Confirm and fill in. --> |
| <div class="container"> |
| <p> |
| © Copyright .<br/> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |